[llvm] [RISCV] Default to MicroOpBufferSize = 1 for scheduling purposes (PR #126608)

Philip Reames via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 12 12:30:21 PST 2025


https://github.com/preames updated https://github.com/llvm/llvm-project/pull/126608

>From d2857c4ab8c6304cbef8b99ce4cf6ac3a1d057f9 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Mon, 10 Feb 2025 08:31:37 -0800
Subject: [PATCH] [RISCV] Default to MicroOpBufferSize = 1 for scheduling
 purposes

This change introduces a default schedule model for the RISCV target
which leaves everything unchanged except the MicroOpBufferSize.  The
default value of this flag in NoSchedModel is 0.  Both configurations
represent in-order cores (i.e. no reorder window); the difference
between them comes down to whether heuristics other than latency
are allowed to apply.  (Implementation details below.)

I left the processor models which explicitly set MicroOpBufferSize=0
unchanged in this patch, but strongly suspect we should change those
too.  Honestly, I think the LLVM-wide default for this flag should be
changed, but I don't have the energy to manage the updates for all
targets.

Implementation-wise, the effect of this change is that schedule units
which are ready to run *except that* one of their predecessors may not
have completed yet are added to the Available list, not the Pending one.
As a result, it becomes possible to choose to schedule a node before
its ready cycle if the heuristics prefer.  This is essentially choosing
to insert a resource stall instead of e.g. increasing register pressure.
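
For reference, here is a minimal standalone C++ sketch of the
Pending/Available decision described above.  This is not the actual
MachineScheduler code; classifyNode and its parameters are made up
purely for illustration.

  #include <cstdio>

  // Hypothetical illustration of the decision described above; the real
  // logic lives in the generic scheduler's ready queues.
  enum class Queue { Available, Pending };

  // With MicroOpBufferSize == 0, a node whose ready cycle is still in
  // the future is held in Pending, so only latency can drive placement.
  // With a non-zero buffer size it goes straight to Available, so other
  // heuristics (e.g. register pressure) may pick it early at the cost
  // of a stall.
  Queue classifyNode(unsigned ReadyCycle, unsigned CurrCycle,
                     unsigned MicroOpBufferSize) {
    bool IsBuffered = MicroOpBufferSize != 0;
    if (!IsBuffered && ReadyCycle > CurrCycle)
      return Queue::Pending;
    return Queue::Available;
  }

  int main() {
    // A node that becomes ready two cycles from now:
    std::printf("buffer=0: %s\n", classifyNode(5, 3, 0) == Queue::Pending
                                      ? "Pending" : "Available");
    std::printf("buffer=1: %s\n", classifyNode(5, 3, 1) == Queue::Pending
                                      ? "Pending" : "Available");
    return 0;
  }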

Note that I was initially concerned there might be a correctness aspect
(as in some kind of exposed pipeline design), but the generic scheduler
doesn't seem to know how to insert noop instructions.  Without that,
correct scheduling on an exposed pipeline couldn't be guaranteed anyway,
depending on the program and schedule model in question.

The effect of this is that we sometimes prefer reducing register
pressure over latency in codegen results.  This is mostly churn (or
small wins) on the scalar side because we have many more registers
there, but it is of major importance on the vector side - particularly
at high LMUL - because we effectively have many fewer registers and the
relative cost of spilling is much higher.  This is a significant
improvement in high LMUL code quality for default rva23u configurations
- or any vector configuration without an explicit -mcpu, for that
matter.
---
 llvm/lib/Target/RISCV/RISCVProcessors.td      |   21 +-
 llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll |   40 +-
 .../RISCV/GlobalISel/combine-neg-abs.ll       |  192 +-
 .../CodeGen/RISCV/GlobalISel/double-arith.ll  |  142 +-
 .../CodeGen/RISCV/GlobalISel/float-arith.ll   |  108 +-
 llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll  |   40 +-
 .../CodeGen/RISCV/GlobalISel/rotl-rotr.ll     |    6 +-
 llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll |    2 +-
 .../test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll |    2 +-
 llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll  |    2 +-
 .../GlobalISel/stacksave-stackrestore.ll      |    4 +-
 llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll  |  136 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 3172 ++++++-----
 llvm/test/CodeGen/RISCV/abds-neg.ll           |  692 +--
 llvm/test/CodeGen/RISCV/abds.ll               |  204 +-
 llvm/test/CodeGen/RISCV/abdu-neg.ll           |  324 +-
 llvm/test/CodeGen/RISCV/abdu.ll               |  860 +--
 llvm/test/CodeGen/RISCV/add-before-shl.ll     |   32 +-
 llvm/test/CodeGen/RISCV/add-imm.ll            |   32 +-
 llvm/test/CodeGen/RISCV/alloca.ll             |   16 +-
 llvm/test/CodeGen/RISCV/alu64.ll              |    9 +-
 llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll |  136 +-
 llvm/test/CodeGen/RISCV/atomic-rmw.ll         | 4840 ++++++++---------
 llvm/test/CodeGen/RISCV/atomic-signext.ll     | 1290 +++--
 .../CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll |  612 ++-
 .../CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll |  630 ++-
 llvm/test/CodeGen/RISCV/bf16-promote.ll       |   20 +-
 llvm/test/CodeGen/RISCV/bfloat-convert.ll     |   96 +-
 llvm/test/CodeGen/RISCV/bfloat-mem.ll         |    8 +-
 llvm/test/CodeGen/RISCV/bfloat.ll             |   40 +-
 llvm/test/CodeGen/RISCV/bittest.ll            |  742 +--
 llvm/test/CodeGen/RISCV/branch-on-zero.ll     |   10 +-
 .../test/CodeGen/RISCV/callee-saved-fpr32s.ll |  256 +-
 .../test/CodeGen/RISCV/callee-saved-fpr64s.ll |  200 +-
 llvm/test/CodeGen/RISCV/callee-saved-gprs.ll  |  568 +-
 llvm/test/CodeGen/RISCV/calling-conv-half.ll  |   24 +-
 .../RISCV/calling-conv-ilp32-ilp32f-common.ll |  140 +-
 ...calling-conv-ilp32-ilp32f-ilp32d-common.ll |  256 +-
 llvm/test/CodeGen/RISCV/calling-conv-ilp32.ll |    4 +-
 .../test/CodeGen/RISCV/calling-conv-ilp32d.ll |   32 +-
 .../test/CodeGen/RISCV/calling-conv-ilp32e.ll |  648 +--
 .../calling-conv-ilp32f-ilp32d-common.ll      |   34 +-
 .../calling-conv-lp64-lp64f-lp64d-common.ll   |   92 +-
 llvm/test/CodeGen/RISCV/calling-conv-lp64.ll  |    4 +-
 llvm/test/CodeGen/RISCV/calling-conv-lp64e.ll |    4 +-
 .../CodeGen/RISCV/calling-conv-rv32f-ilp32.ll |    6 +-
 .../RISCV/calling-conv-rv32f-ilp32e.ll        |    8 +-
 llvm/test/CodeGen/RISCV/calls.ll              |   16 +-
 llvm/test/CodeGen/RISCV/codemodel-lowering.ll |   20 +-
 llvm/test/CodeGen/RISCV/condbinops.ll         |   11 +-
 llvm/test/CodeGen/RISCV/condops.ll            |   72 +-
 llvm/test/CodeGen/RISCV/copysign-casts.ll     |   18 +-
 llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll    |    8 +-
 .../test/CodeGen/RISCV/double-calling-conv.ll |   10 +-
 llvm/test/CodeGen/RISCV/double-convert.ll     |  128 +-
 llvm/test/CodeGen/RISCV/double-fcmp-strict.ll |   52 +-
 llvm/test/CodeGen/RISCV/double-imm.ll         |    8 +-
 llvm/test/CodeGen/RISCV/double-mem.ll         |    6 +-
 .../CodeGen/RISCV/double-round-conv-sat.ll    |   84 +-
 llvm/test/CodeGen/RISCV/double-select-fcmp.ll |   20 +-
 .../RISCV/double-stack-spill-restore.ll       |    4 +-
 llvm/test/CodeGen/RISCV/fastcc-bf16.ll        |    6 +-
 llvm/test/CodeGen/RISCV/fastcc-float.ll       |    6 +-
 llvm/test/CodeGen/RISCV/fastcc-half.ll        |    6 +-
 .../CodeGen/RISCV/fastcc-without-f-reg.ll     |  144 +-
 llvm/test/CodeGen/RISCV/float-convert.ll      |   36 +-
 llvm/test/CodeGen/RISCV/float-fcmp-strict.ll  |   32 +-
 llvm/test/CodeGen/RISCV/float-select-fcmp.ll  |    8 +-
 .../test/CodeGen/RISCV/fold-addi-loadstore.ll |   44 +-
 llvm/test/CodeGen/RISCV/forced-atomics.ll     |  292 +-
 llvm/test/CodeGen/RISCV/fp-fcanonicalize.ll   |   40 +-
 llvm/test/CodeGen/RISCV/fp128.ll              |  112 +-
 llvm/test/CodeGen/RISCV/fpclamptosat.ll       |  348 +-
 .../CodeGen/RISCV/get-setcc-result-type.ll    |   28 +-
 llvm/test/CodeGen/RISCV/half-arith.ll         |   32 +-
 .../test/CodeGen/RISCV/half-convert-strict.ll |   48 +-
 llvm/test/CodeGen/RISCV/half-convert.ll       |  652 +--
 llvm/test/CodeGen/RISCV/half-fcmp-strict.ll   |  120 +-
 llvm/test/CodeGen/RISCV/half-intrinsics.ll    |   16 +-
 llvm/test/CodeGen/RISCV/half-mem.ll           |   16 +-
 llvm/test/CodeGen/RISCV/half-select-fcmp.ll   |   16 +-
 llvm/test/CodeGen/RISCV/iabs.ll               |  136 +-
 .../RISCV/inline-asm-d-constraint-f.ll        |   16 +-
 .../CodeGen/RISCV/inline-asm-d-modifier-N.ll  |   16 +-
 .../RISCV/inline-asm-f-constraint-f.ll        |   32 +-
 .../CodeGen/RISCV/inline-asm-f-modifier-N.ll  |   32 +-
 .../RISCV/inline-asm-zfinx-constraint-r.ll    |    4 +-
 .../RISCV/inline-asm-zhinx-constraint-r.ll    |    8 +-
 llvm/test/CodeGen/RISCV/inline-asm.ll         |   16 +-
 .../RISCV/intrinsic-cttz-elts-vscale.ll       |   12 +-
 llvm/test/CodeGen/RISCV/legalize-fneg.ll      |   38 +-
 llvm/test/CodeGen/RISCV/llvm.exp10.ll         |   40 +-
 llvm/test/CodeGen/RISCV/llvm.frexp.ll         |  326 +-
 ...op-strength-reduce-add-cheaper-than-mul.ll |    4 +-
 .../RISCV/machine-sink-load-immediate.ll      |    2 +-
 .../RISCV/machinelicm-address-pseudos.ll      |   24 +-
 .../CodeGen/RISCV/macro-fusion-lui-addi.ll    |    5 +-
 llvm/test/CodeGen/RISCV/mem.ll                |    6 +-
 llvm/test/CodeGen/RISCV/mem64.ll              |    6 +-
 llvm/test/CodeGen/RISCV/memcmp-optsize.ll     |  342 +-
 llvm/test/CodeGen/RISCV/memcmp.ll             |  342 +-
 llvm/test/CodeGen/RISCV/memmove.ll            |  124 +-
 llvm/test/CodeGen/RISCV/memset-pattern.ll     |  122 +-
 llvm/test/CodeGen/RISCV/mul.ll                |   36 +-
 llvm/test/CodeGen/RISCV/neg-abs.ll            |   56 +-
 llvm/test/CodeGen/RISCV/orc-b-patterns.ll     |   20 +-
 .../test/CodeGen/RISCV/overflow-intrinsics.ll |   60 +-
 llvm/test/CodeGen/RISCV/pr51206.ll            |   20 +-
 llvm/test/CodeGen/RISCV/pr58511.ll            |    4 +-
 llvm/test/CodeGen/RISCV/pr63816.ll            |    4 +-
 llvm/test/CodeGen/RISCV/pr69586.ll            | 1157 ++--
 llvm/test/CodeGen/RISCV/push-pop-popret.ll    |  984 ++--
 .../CodeGen/RISCV/riscv-codegenprepare-asm.ll |    2 +-
 llvm/test/CodeGen/RISCV/rotl-rotr.ll          |  234 +-
 .../CodeGen/RISCV/rv32-inline-asm-pairs.ll    |    4 +-
 llvm/test/CodeGen/RISCV/rv32zbb.ll            |  110 +-
 llvm/test/CodeGen/RISCV/rv32zbs.ll            |    4 +-
 .../test/CodeGen/RISCV/rv64-double-convert.ll |    2 +-
 llvm/test/CodeGen/RISCV/rv64-half-convert.ll  |   12 +-
 .../CodeGen/RISCV/rv64-inline-asm-pairs.ll    |    4 +-
 llvm/test/CodeGen/RISCV/rv64-trampoline.ll    |    4 +-
 .../test/CodeGen/RISCV/rv64i-demanded-bits.ll |    4 +-
 llvm/test/CodeGen/RISCV/rv64zbkb.ll           |    2 +-
 .../CodeGen/RISCV/rvv/active_lane_mask.ll     |   20 +-
 .../rvv/alloca-load-store-scalable-array.ll   |   18 +-
 .../rvv/alloca-load-store-scalable-struct.ll  |    8 +-
 .../CodeGen/RISCV/rvv/bitreverse-sdnode.ll    |  192 +-
 llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll  |  346 +-
 llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll   |  148 +-
 llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll       |  398 +-
 .../CodeGen/RISCV/rvv/calling-conv-fastcc.ll  |  182 +-
 llvm/test/CodeGen/RISCV/rvv/calling-conv.ll   |   16 +-
 llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll        |  305 +-
 .../RISCV/rvv/combine-store-extract-crash.ll  |   24 +-
 .../RISCV/rvv/concat-vector-insert-elt.ll     |   62 +-
 .../RISCV/rvv/constant-folding-crash.ll       |   19 +-
 llvm/test/CodeGen/RISCV/rvv/copyprop.mir      |    4 +-
 llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll    |  246 +-
 llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll        |   60 +-
 llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll    |  336 +-
 llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll        |   98 +-
 .../RISCV/rvv/dont-sink-splat-operands.ll     |   66 +-
 .../CodeGen/RISCV/rvv/double-round-conv.ll    |   96 +-
 llvm/test/CodeGen/RISCV/rvv/expandload.ll     | 1637 +++---
 llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll  |    8 +-
 llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll  |   36 +-
 .../CodeGen/RISCV/rvv/extractelt-int-rv64.ll  |    8 +-
 .../RISCV/rvv/fceil-constrained-sdnode.ll     |   50 +-
 llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll   |  118 +-
 .../RISCV/rvv/ffloor-constrained-sdnode.ll    |   50 +-
 llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll  |  118 +-
 .../rvv/fixed-vector-i8-index-cornercase.ll   |   63 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll |    2 +-
 .../rvv/fixed-vectors-bitcast-large-vector.ll |    8 +-
 .../RISCV/rvv/fixed-vectors-bitreverse-vp.ll  |  382 +-
 .../RISCV/rvv/fixed-vectors-bswap-vp.ll       |  308 +-
 .../rvv/fixed-vectors-buildvec-of-binop.ll    |   39 +-
 .../rvv/fixed-vectors-calling-conv-fastcc.ll  |   51 +-
 .../RISCV/rvv/fixed-vectors-calling-conv.ll   |   33 +-
 .../RISCV/rvv/fixed-vectors-ceil-vp.ll        |  168 +-
 .../RISCV/rvv/fixed-vectors-ctlz-vp.ll        | 1092 ++--
 .../RISCV/rvv/fixed-vectors-ctpop-vp.ll       |  500 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll  |    8 +-
 .../RISCV/rvv/fixed-vectors-cttz-vp.ll        | 1032 ++--
 .../CodeGen/RISCV/rvv/fixed-vectors-cttz.ll   |   24 +-
 .../rvv/fixed-vectors-deinterleave-load.ll    |    2 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-elen.ll   |   56 +-
 .../rvv/fixed-vectors-extload-truncstore.ll   |    4 +-
 .../RISCV/rvv/fixed-vectors-extract-i1.ll     |   44 +-
 .../rvv/fixed-vectors-extract-subvector.ll    |   61 +-
 .../RISCV/rvv/fixed-vectors-extract.ll        |   36 +-
 .../fixed-vectors-fceil-constrained-sdnode.ll |   48 +-
 ...fixed-vectors-ffloor-constrained-sdnode.ll |   48 +-
 .../RISCV/rvv/fixed-vectors-floor-vp.ll       |  168 +-
 .../RISCV/rvv/fixed-vectors-fmaximum-vp.ll    |   84 +-
 .../RISCV/rvv/fixed-vectors-fmaximum.ll       |   56 +-
 .../RISCV/rvv/fixed-vectors-fminimum-vp.ll    |   84 +-
 .../RISCV/rvv/fixed-vectors-fminimum.ll       |   56 +-
 ...d-vectors-fnearbyint-constrained-sdnode.ll |   62 +-
 .../RISCV/rvv/fixed-vectors-fp-buildvec.ll    |   10 +-
 .../RISCV/rvv/fixed-vectors-fp-interleave.ll  |    6 +-
 .../RISCV/rvv/fixed-vectors-fp-setcc.ll       |  140 +-
 .../RISCV/rvv/fixed-vectors-fp-shuffles.ll    |   57 +-
 .../RISCV/rvv/fixed-vectors-fp-vrgather.ll    |    4 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll     |  738 +--
 .../RISCV/rvv/fixed-vectors-fpext-vp.ll       |    2 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-fpowi.ll  |    4 +-
 .../RISCV/rvv/fixed-vectors-fptosi-vp.ll      |    2 +-
 .../RISCV/rvv/fixed-vectors-fptoui-vp.ll      |    2 +-
 .../RISCV/rvv/fixed-vectors-fptrunc-vp.ll     |    2 +-
 ...fixed-vectors-fround-constrained-sdnode.ll |   48 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-fround.ll |   98 +-
 ...d-vectors-froundeven-constrained-sdnode.ll |   48 +-
 .../RISCV/rvv/fixed-vectors-froundeven.ll     |   98 +-
 .../rvv/fixed-vectors-insert-subvector.ll     |   35 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-insert.ll |    6 +-
 .../RISCV/rvv/fixed-vectors-int-buildvec.ll   |  991 ++--
 .../RISCV/rvv/fixed-vectors-int-interleave.ll |    6 +-
 .../RISCV/rvv/fixed-vectors-int-shuffles.ll   |  117 +-
 .../RISCV/rvv/fixed-vectors-int-vrgather.ll   |   16 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-int.ll    |  125 +-
 ...fixed-vectors-interleaved-access-zve32x.ll |   54 +-
 .../rvv/fixed-vectors-interleaved-access.ll   |  862 ++-
 .../CodeGen/RISCV/rvv/fixed-vectors-llrint.ll |  102 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-lrint.ll  |  244 +-
 .../RISCV/rvv/fixed-vectors-mask-buildvec.ll  |   30 +-
 .../RISCV/rvv/fixed-vectors-mask-splat.ll     |   16 +-
 .../RISCV/rvv/fixed-vectors-masked-gather.ll  |  476 +-
 .../RISCV/rvv/fixed-vectors-masked-load-fp.ll |   20 +-
 .../rvv/fixed-vectors-masked-load-int.ll      |   14 +-
 .../RISCV/rvv/fixed-vectors-masked-scatter.ll |  636 +--
 .../rvv/fixed-vectors-masked-store-fp.ll      |   20 +-
 .../rvv/fixed-vectors-masked-store-int.ll     |   14 +-
 .../RISCV/rvv/fixed-vectors-nearbyint-vp.ll   |  196 +-
 .../rvv/fixed-vectors-reduction-formation.ll  |   84 +-
 .../rvv/fixed-vectors-reduction-fp-vp.ll      |    4 +-
 .../RISCV/rvv/fixed-vectors-reduction-fp.ll   |  422 +-
 .../rvv/fixed-vectors-reduction-int-vp.ll     |   72 +-
 .../RISCV/rvv/fixed-vectors-reduction-int.ll  |  252 +-
 .../RISCV/rvv/fixed-vectors-rint-vp.ll        |    8 +-
 .../RISCV/rvv/fixed-vectors-round-vp.ll       |  168 +-
 .../RISCV/rvv/fixed-vectors-roundeven-vp.ll   |  168 +-
 .../RISCV/rvv/fixed-vectors-roundtozero-vp.ll |  168 +-
 .../RISCV/rvv/fixed-vectors-select-addsub.ll  |   24 +-
 .../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll    | 3842 ++++++-------
 .../RISCV/rvv/fixed-vectors-setcc-int-vp.ll   |   63 +-
 .../RISCV/rvv/fixed-vectors-sext-vp.ll        |    2 +-
 .../fixed-vectors-shuffle-changes-length.ll   |   46 +-
 .../RISCV/rvv/fixed-vectors-shuffle-concat.ll |    4 +-
 .../rvv/fixed-vectors-shuffle-deinterleave.ll |   22 +-
 .../rvv/fixed-vectors-shuffle-exact-vlen.ll   |   41 +-
 .../rvv/fixed-vectors-shuffle-reverse.ll      |    2 +-
 .../rvv/fixed-vectors-shufflevector-vnsrl.ll  |    8 +-
 .../RISCV/rvv/fixed-vectors-sitofp-vp.ll      |    2 +-
 .../rvv/fixed-vectors-store-merge-crash.ll    |    2 +-
 .../fixed-vectors-strided-load-store-asm.ll   |   24 +-
 .../RISCV/rvv/fixed-vectors-strided-vpload.ll |   20 +-
 .../rvv/fixed-vectors-strided-vpstore.ll      |    2 +-
 .../RISCV/rvv/fixed-vectors-trunc-vp.ll       |   90 +-
 .../RISCV/rvv/fixed-vectors-uitofp-vp.ll      |    2 +-
 .../RISCV/rvv/fixed-vectors-unaligned.ll      |   80 +-
 .../RISCV/rvv/fixed-vectors-vadd-vp.ll        |   10 +-
 .../RISCV/rvv/fixed-vectors-vcopysign-vp.ll   |   40 +-
 .../RISCV/rvv/fixed-vectors-vfabs-vp.ll       |    2 +-
 .../RISCV/rvv/fixed-vectors-vfma-vp.ll        |   80 +-
 .../RISCV/rvv/fixed-vectors-vfmax-vp.ll       |   40 +-
 .../RISCV/rvv/fixed-vectors-vfmin-vp.ll       |   40 +-
 .../RISCV/rvv/fixed-vectors-vfmuladd-vp.ll    |   80 +-
 .../RISCV/rvv/fixed-vectors-vfneg-vp.ll       |    2 +-
 .../RISCV/rvv/fixed-vectors-vfsqrt-vp.ll      |    2 +-
 .../RISCV/rvv/fixed-vectors-vmax-vp.ll        |   10 +-
 .../RISCV/rvv/fixed-vectors-vmaxu-vp.ll       |   10 +-
 .../RISCV/rvv/fixed-vectors-vmin-vp.ll        |   10 +-
 .../RISCV/rvv/fixed-vectors-vminu-vp.ll       |   10 +-
 .../RISCV/rvv/fixed-vectors-vpgather.ll       |  162 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vpload.ll |   12 +-
 .../RISCV/rvv/fixed-vectors-vpmerge.ll        |   14 +-
 .../RISCV/rvv/fixed-vectors-vpscatter.ll      |   92 +-
 .../RISCV/rvv/fixed-vectors-vpstore.ll        |    6 +-
 .../RISCV/rvv/fixed-vectors-vsadd-vp.ll       |   10 +-
 .../RISCV/rvv/fixed-vectors-vsaddu-vp.ll      |   10 +-
 .../RISCV/rvv/fixed-vectors-vscale-range.ll   |  104 +-
 .../RISCV/rvv/fixed-vectors-vselect-vp.ll     |  157 +-
 .../RISCV/rvv/fixed-vectors-vselect.ll        |  312 +-
 .../RISCV/rvv/fixed-vectors-vssub-vp.ll       |    2 +-
 .../RISCV/rvv/fixed-vectors-vssubu-vp.ll      |    2 +-
 .../RISCV/rvv/fixed-vectors-vwadd-mask.ll     |    4 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll  |   20 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll |   20 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll  |    8 +-
 .../RISCV/rvv/fixed-vectors-vwmulsu.ll        |    6 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll |    2 +-
 .../RISCV/rvv/fixed-vectors-vwsub-mask.ll     |    4 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll  |   44 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll |   44 +-
 .../RISCV/rvv/fixed-vectors-zext-vp.ll        |    2 +-
 .../CodeGen/RISCV/rvv/float-round-conv.ll     |   16 +-
 llvm/test/CodeGen/RISCV/rvv/floor-vp.ll       |  305 +-
 .../test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll |  116 +-
 llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll    |  187 +-
 .../test/CodeGen/RISCV/rvv/fminimum-sdnode.ll |  116 +-
 llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll    |  187 +-
 .../rvv/fnearbyint-constrained-sdnode.ll      |   80 +-
 .../CodeGen/RISCV/rvv/fnearbyint-sdnode.ll    |  164 +-
 .../RISCV/rvv/fold-scalar-load-crash.ll       |   12 +-
 .../test/CodeGen/RISCV/rvv/fold-vector-cmp.ll |    4 +-
 .../CodeGen/RISCV/rvv/fpclamptosat_vec.ll     |  492 +-
 llvm/test/CodeGen/RISCV/rvv/frm-insert.ll     |    4 +-
 .../RISCV/rvv/fround-constrained-sdnode.ll    |   50 +-
 llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll  |  118 +-
 .../rvv/froundeven-constrained-sdnode.ll      |   50 +-
 .../CodeGen/RISCV/rvv/froundeven-sdnode.ll    |  118 +-
 llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll   |  229 +-
 .../test/CodeGen/RISCV/rvv/half-round-conv.ll |   24 +-
 llvm/test/CodeGen/RISCV/rvv/localvar.ll       |   16 +-
 llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll  |   76 +-
 llvm/test/CodeGen/RISCV/rvv/memory-args.ll    |    4 +-
 llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll |   39 +-
 .../test/CodeGen/RISCV/rvv/mscatter-sdnode.ll |   54 +-
 .../RISCV/rvv/mutate-prior-vsetvli-avl.ll     |    4 +-
 .../RISCV/rvv/named-vector-shuffle-reverse.ll |   46 +-
 llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll   |  411 +-
 .../CodeGen/RISCV/rvv/no-reserved-frame.ll    |   19 +-
 llvm/test/CodeGen/RISCV/rvv/pr125306.ll       |   44 +-
 llvm/test/CodeGen/RISCV/rvv/pr63596.ll        |    6 +-
 llvm/test/CodeGen/RISCV/rvv/pr95865.ll        |    4 +-
 .../CodeGen/RISCV/rvv/reg-alloc-reserve-bp.ll |    4 +-
 llvm/test/CodeGen/RISCV/rvv/round-vp.ll       |  305 +-
 llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll   |  305 +-
 llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll |  305 +-
 .../test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll |   10 +-
 .../RISCV/rvv/rvv-peephole-vmerge-vops.ll     |    2 +-
 llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll    |  322 +-
 llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll   |   78 +-
 llvm/test/CodeGen/RISCV/rvv/shrinkwrap.ll     |    8 +-
 .../CodeGen/RISCV/rvv/sink-splat-operands.ll  |  180 +-
 llvm/test/CodeGen/RISCV/rvv/stepvector.ll     |   34 +-
 llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll |   12 +-
 .../test/CodeGen/RISCV/rvv/strided-vpstore.ll |   20 +-
 llvm/test/CodeGen/RISCV/rvv/umulo-sdnode.ll   |  176 +-
 llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll |   40 +-
 llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll        |    7 +-
 .../RISCV/rvv/vector-deinterleave-fixed.ll    |   28 +-
 .../RISCV/rvv/vector-deinterleave-load.ll     |   53 +-
 .../CodeGen/RISCV/rvv/vector-deinterleave.ll  |  367 +-
 .../RISCV/rvv/vector-extract-last-active.ll   |   28 +-
 .../RISCV/rvv/vector-interleave-fixed.ll      |   96 +-
 .../CodeGen/RISCV/rvv/vector-interleave.ll    | 1088 ++--
 .../test/CodeGen/RISCV/rvv/vfma-vp-combine.ll |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll        |  161 +-
 llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll    |  145 +-
 llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll    |   73 +-
 llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll      |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll  |   64 +-
 llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll        |    7 +-
 llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll       |    7 +-
 llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll        |    7 +-
 llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll       |    7 +-
 llvm/test/CodeGen/RISCV/rvv/vmseq.ll          |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vmsge.ll          |   20 +-
 llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll         |   20 +-
 llvm/test/CodeGen/RISCV/rvv/vmsgt.ll          |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll         |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vmsle.ll          |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vmsleu.ll         |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vmslt.ll          |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vmsltu.ll         |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vmsne.ll          |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll        |    8 +-
 .../RISCV/rvv/vp-combine-store-reverse.ll     |    3 +-
 llvm/test/CodeGen/RISCV/rvv/vp-cttz-elts.ll   |   14 +-
 .../RISCV/rvv/vp-vector-interleaved-access.ll |   42 +-
 .../test/CodeGen/RISCV/rvv/vpgather-sdnode.ll |   74 +-
 llvm/test/CodeGen/RISCV/rvv/vpload.ll         |    6 +-
 llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll |   21 +-
 .../CodeGen/RISCV/rvv/vpscatter-sdnode.ll     |   78 +-
 llvm/test/CodeGen/RISCV/rvv/vpstore.ll        |   24 +-
 .../RISCV/rvv/vreductions-fp-sdnode.ll        |    6 +-
 .../RISCV/rvv/vrgatherei16-subreg-liveness.ll |   16 +-
 .../RISCV/rvv/vscale-vw-web-simplification.ll |   56 +-
 llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll     |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll     |  138 +-
 .../RISCV/rvv/vsetvli-insert-crossbb.ll       |   26 +-
 llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll |    4 +-
 .../CodeGen/RISCV/rvv/vsetvli-regression.ll   |    5 +-
 llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll      |   73 +-
 .../CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll    |    4 +-
 llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll   |    4 +-
 .../CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll    |    4 +-
 .../RISCV/rvv/vxrm-insert-out-of-loop.ll      |   58 +-
 llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll    |    4 +-
 .../RISCV/rvv/wrong-chain-fixed-load.ll       |    6 +-
 llvm/test/CodeGen/RISCV/scmp.ll               |   20 +-
 llvm/test/CodeGen/RISCV/select-and.ll         |   16 +-
 llvm/test/CodeGen/RISCV/select-bare.ll        |   16 +-
 llvm/test/CodeGen/RISCV/select-cc.ll          |   70 +-
 .../test/CodeGen/RISCV/select-constant-xor.ll |   16 +-
 .../CodeGen/RISCV/select-optimize-multiple.ll |   30 +-
 llvm/test/CodeGen/RISCV/select-or.ll          |   16 +-
 llvm/test/CodeGen/RISCV/sextw-removal.ll      |   24 +-
 llvm/test/CodeGen/RISCV/shift-amount-mod.ll   |   72 +-
 llvm/test/CodeGen/RISCV/shifts.ll             |   56 +-
 llvm/test/CodeGen/RISCV/shl-cttz.ll           |   60 +-
 llvm/test/CodeGen/RISCV/split-offsets.ll      |    4 +-
 .../CodeGen/RISCV/srem-seteq-illegal-types.ll |  105 +-
 llvm/test/CodeGen/RISCV/srem-vector-lkk.ll    |  389 +-
 llvm/test/CodeGen/RISCV/stack-slot-size.ll    |   12 +-
 llvm/test/CodeGen/RISCV/stack-store-check.ll  |  290 +-
 llvm/test/CodeGen/RISCV/tail-calls.ll         |   68 +-
 llvm/test/CodeGen/RISCV/ucmp.ll               |   20 +-
 .../CodeGen/RISCV/unaligned-load-store.ll     |  144 +-
 .../CodeGen/RISCV/urem-seteq-illegal-types.ll |  122 +-
 llvm/test/CodeGen/RISCV/urem-vector-lkk.ll    |  337 +-
 llvm/test/CodeGen/RISCV/vararg.ll             |  596 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 1073 ++--
 .../RISCV/wide-scalar-shift-legalization.ll   |  520 +-
 llvm/test/CodeGen/RISCV/xaluo.ll              |    6 +-
 llvm/test/CodeGen/RISCV/xtheadmemidx.ll       |   20 +-
 llvm/test/CodeGen/RISCV/xtheadmempair.ll      |   32 +-
 llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll  |   30 +-
 .../CodeGen/RISCV/zdinx-asm-constraint.ll     |   36 +-
 .../CodeGen/RISCV/zdinx-boundary-check.ll     |  156 +-
 402 files changed, 27971 insertions(+), 29148 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index b5eea138732a5..c54afa1e6e72e 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -88,21 +88,30 @@ class RISCVTuneProcessorModel<string n,
 
 defvar GenericTuneFeatures = [TuneOptimizedNF2SegmentLoadStore];
 
+// Adjust the default cost model to enable all heuristics, not just
+// latency.  In particular, this enables register pressure heuristics,
+// which are very important for high LMUL vector code and have little
+// negative impact on other configurations.
+def GenericModel : SchedMachineModel {
+  let MicroOpBufferSize = 1;
+  let CompleteModel = 0;
+}
+
 def GENERIC_RV32 : RISCVProcessorModel<"generic-rv32",
-                                       NoSchedModel,
+                                       GenericModel,
                                        [Feature32Bit,
                                         FeatureStdExtI],
                                        GenericTuneFeatures>,
                    GenericTuneInfo;
 def GENERIC_RV64 : RISCVProcessorModel<"generic-rv64",
-                                       NoSchedModel,
+                                       GenericModel,
                                        [Feature64Bit,
                                         FeatureStdExtI],
                                        GenericTuneFeatures>,
                    GenericTuneInfo;
 // Support generic for compatibility with other targets. The triple will be used
 // to change to the appropriate rv32/rv64 version.
-def GENERIC : RISCVTuneProcessorModel<"generic", NoSchedModel>, GenericTuneInfo;
+def GENERIC : RISCVTuneProcessorModel<"generic", GenericModel>, GenericTuneInfo;
 
 def MIPS_P8700 : RISCVProcessorModel<"mips-p8700",
                                      MIPSP8700Model,
@@ -496,7 +505,7 @@ def TENSTORRENT_ASCALON_D8 : RISCVProcessorModel<"tt-ascalon-d8",
                                                   TunePostRAScheduler]>;
 
 def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1",
-                                            NoSchedModel,
+                                            GenericModel,
                                             [Feature64Bit,
                                              FeatureStdExtI,
                                              FeatureStdExtZifencei,
@@ -556,7 +565,7 @@ def XIANGSHAN_NANHU : RISCVProcessorModel<"xiangshan-nanhu",
                                             TuneShiftedZExtWFusion]>;
 
 def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60",
-                                       NoSchedModel,
+                                       GenericModel,
                                        !listconcat(RVA22S64Features,
                                        [FeatureStdExtV,
                                         FeatureStdExtSscofpmf,
@@ -581,7 +590,7 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60",
 }
 
 def RP2350_HAZARD3 : RISCVProcessorModel<"rp2350-hazard3",
-                                         NoSchedModel,
+                                         GenericModel,
                                          [Feature32Bit,
                                           FeatureStdExtI,
                                           FeatureStdExtM,
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll b/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll
index 0fd23a7d346df..1b96189aaea5c 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll
@@ -212,30 +212,30 @@ define i64 @add64_accept(i64 %a) nounwind {
 define void @add32_reject() nounwind {
 ; RV32I-LABEL: add32_reject:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lui a0, %hi(ga)
-; RV32I-NEXT:    lui a1, %hi(gb)
-; RV32I-NEXT:    lw a2, %lo(ga)(a0)
-; RV32I-NEXT:    lw a3, %lo(gb)(a1)
-; RV32I-NEXT:    lui a4, 1
-; RV32I-NEXT:    addi a4, a4, -1096
-; RV32I-NEXT:    add a2, a2, a4
-; RV32I-NEXT:    add a3, a3, a4
-; RV32I-NEXT:    sw a2, %lo(ga)(a0)
-; RV32I-NEXT:    sw a3, %lo(gb)(a1)
+; RV32I-NEXT:    lui a0, 1
+; RV32I-NEXT:    lui a1, %hi(ga)
+; RV32I-NEXT:    lui a2, %hi(gb)
+; RV32I-NEXT:    lw a3, %lo(ga)(a1)
+; RV32I-NEXT:    lw a4, %lo(gb)(a2)
+; RV32I-NEXT:    addi a0, a0, -1096
+; RV32I-NEXT:    add a3, a3, a0
+; RV32I-NEXT:    add a0, a4, a0
+; RV32I-NEXT:    sw a3, %lo(ga)(a1)
+; RV32I-NEXT:    sw a0, %lo(gb)(a2)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: add32_reject:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a0, %hi(ga)
-; RV64I-NEXT:    lui a1, %hi(gb)
-; RV64I-NEXT:    lw a2, %lo(ga)(a0)
-; RV64I-NEXT:    lw a3, %lo(gb)(a1)
-; RV64I-NEXT:    lui a4, 1
-; RV64I-NEXT:    addi a4, a4, -1096
-; RV64I-NEXT:    add a2, a2, a4
-; RV64I-NEXT:    add a3, a3, a4
-; RV64I-NEXT:    sw a2, %lo(ga)(a0)
-; RV64I-NEXT:    sw a3, %lo(gb)(a1)
+; RV64I-NEXT:    lui a0, 1
+; RV64I-NEXT:    lui a1, %hi(ga)
+; RV64I-NEXT:    lui a2, %hi(gb)
+; RV64I-NEXT:    lw a3, %lo(ga)(a1)
+; RV64I-NEXT:    lw a4, %lo(gb)(a2)
+; RV64I-NEXT:    addi a0, a0, -1096
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    add a0, a4, a0
+; RV64I-NEXT:    sw a3, %lo(ga)(a1)
+; RV64I-NEXT:    sw a0, %lo(gb)(a2)
 ; RV64I-NEXT:    ret
   %1 = load i32, ptr @ga, align 4
   %2 = load i32, ptr @gb, align 4
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll b/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll
index 3a55189076dee..5b9f0e60e7d80 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll
@@ -93,49 +93,49 @@ define i32 @expanded_neg_abs32_unsigned(i32 %x) {
 define i64 @expanded_neg_abs64(i64 %x) {
 ; RV32I-LABEL: expanded_neg_abs64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    snez a2, a0
-; RV32I-NEXT:    neg a3, a1
-; RV32I-NEXT:    sub a2, a3, a2
-; RV32I-NEXT:    neg a3, a0
-; RV32I-NEXT:    beq a2, a1, .LBB2_2
+; RV32I-NEXT:    neg a2, a0
+; RV32I-NEXT:    snez a3, a0
+; RV32I-NEXT:    neg a4, a1
+; RV32I-NEXT:    sub a3, a4, a3
+; RV32I-NEXT:    beq a3, a1, .LBB2_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slt a4, a1, a2
+; RV32I-NEXT:    slt a4, a1, a3
 ; RV32I-NEXT:    beqz a4, .LBB2_3
 ; RV32I-NEXT:    j .LBB2_4
 ; RV32I-NEXT:  .LBB2_2:
-; RV32I-NEXT:    sltu a4, a0, a3
+; RV32I-NEXT:    sltu a4, a0, a2
 ; RV32I-NEXT:    bnez a4, .LBB2_4
 ; RV32I-NEXT:  .LBB2_3:
-; RV32I-NEXT:    mv a3, a0
-; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
 ; RV32I-NEXT:  .LBB2_4:
-; RV32I-NEXT:    neg a0, a3
-; RV32I-NEXT:    snez a1, a3
-; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    neg a0, a2
+; RV32I-NEXT:    snez a1, a2
+; RV32I-NEXT:    neg a2, a3
 ; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: expanded_neg_abs64:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    snez a2, a0
-; RV32ZBB-NEXT:    neg a3, a1
-; RV32ZBB-NEXT:    sub a2, a3, a2
-; RV32ZBB-NEXT:    neg a3, a0
-; RV32ZBB-NEXT:    beq a2, a1, .LBB2_2
+; RV32ZBB-NEXT:    neg a2, a0
+; RV32ZBB-NEXT:    snez a3, a0
+; RV32ZBB-NEXT:    neg a4, a1
+; RV32ZBB-NEXT:    sub a3, a4, a3
+; RV32ZBB-NEXT:    beq a3, a1, .LBB2_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    slt a4, a1, a2
+; RV32ZBB-NEXT:    slt a4, a1, a3
 ; RV32ZBB-NEXT:    beqz a4, .LBB2_3
 ; RV32ZBB-NEXT:    j .LBB2_4
 ; RV32ZBB-NEXT:  .LBB2_2:
-; RV32ZBB-NEXT:    sltu a4, a0, a3
+; RV32ZBB-NEXT:    sltu a4, a0, a2
 ; RV32ZBB-NEXT:    bnez a4, .LBB2_4
 ; RV32ZBB-NEXT:  .LBB2_3:
-; RV32ZBB-NEXT:    mv a3, a0
-; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:    mv a2, a0
+; RV32ZBB-NEXT:    mv a3, a1
 ; RV32ZBB-NEXT:  .LBB2_4:
-; RV32ZBB-NEXT:    neg a0, a3
-; RV32ZBB-NEXT:    snez a1, a3
-; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    neg a0, a2
+; RV32ZBB-NEXT:    snez a1, a2
+; RV32ZBB-NEXT:    neg a2, a3
 ; RV32ZBB-NEXT:    sub a1, a2, a1
 ; RV32ZBB-NEXT:    ret
 ;
@@ -163,49 +163,49 @@ define i64 @expanded_neg_abs64(i64 %x) {
 define i64 @expanded_neg_abs64_unsigned(i64 %x) {
 ; RV32I-LABEL: expanded_neg_abs64_unsigned:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    snez a2, a0
-; RV32I-NEXT:    neg a3, a1
-; RV32I-NEXT:    sub a2, a3, a2
-; RV32I-NEXT:    neg a3, a0
-; RV32I-NEXT:    beq a2, a1, .LBB3_2
+; RV32I-NEXT:    neg a2, a0
+; RV32I-NEXT:    snez a3, a0
+; RV32I-NEXT:    neg a4, a1
+; RV32I-NEXT:    sub a3, a4, a3
+; RV32I-NEXT:    beq a3, a1, .LBB3_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu a4, a1, a2
+; RV32I-NEXT:    sltu a4, a1, a3
 ; RV32I-NEXT:    beqz a4, .LBB3_3
 ; RV32I-NEXT:    j .LBB3_4
 ; RV32I-NEXT:  .LBB3_2:
-; RV32I-NEXT:    sltu a4, a0, a3
+; RV32I-NEXT:    sltu a4, a0, a2
 ; RV32I-NEXT:    bnez a4, .LBB3_4
 ; RV32I-NEXT:  .LBB3_3:
-; RV32I-NEXT:    mv a3, a0
-; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
 ; RV32I-NEXT:  .LBB3_4:
-; RV32I-NEXT:    neg a0, a3
-; RV32I-NEXT:    snez a1, a3
-; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    neg a0, a2
+; RV32I-NEXT:    snez a1, a2
+; RV32I-NEXT:    neg a2, a3
 ; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: expanded_neg_abs64_unsigned:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    snez a2, a0
-; RV32ZBB-NEXT:    neg a3, a1
-; RV32ZBB-NEXT:    sub a2, a3, a2
-; RV32ZBB-NEXT:    neg a3, a0
-; RV32ZBB-NEXT:    beq a2, a1, .LBB3_2
+; RV32ZBB-NEXT:    neg a2, a0
+; RV32ZBB-NEXT:    snez a3, a0
+; RV32ZBB-NEXT:    neg a4, a1
+; RV32ZBB-NEXT:    sub a3, a4, a3
+; RV32ZBB-NEXT:    beq a3, a1, .LBB3_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu a4, a1, a2
+; RV32ZBB-NEXT:    sltu a4, a1, a3
 ; RV32ZBB-NEXT:    beqz a4, .LBB3_3
 ; RV32ZBB-NEXT:    j .LBB3_4
 ; RV32ZBB-NEXT:  .LBB3_2:
-; RV32ZBB-NEXT:    sltu a4, a0, a3
+; RV32ZBB-NEXT:    sltu a4, a0, a2
 ; RV32ZBB-NEXT:    bnez a4, .LBB3_4
 ; RV32ZBB-NEXT:  .LBB3_3:
-; RV32ZBB-NEXT:    mv a3, a0
-; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:    mv a2, a0
+; RV32ZBB-NEXT:    mv a3, a1
 ; RV32ZBB-NEXT:  .LBB3_4:
-; RV32ZBB-NEXT:    neg a0, a3
-; RV32ZBB-NEXT:    snez a1, a3
-; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    neg a0, a2
+; RV32ZBB-NEXT:    snez a1, a2
+; RV32ZBB-NEXT:    neg a2, a3
 ; RV32ZBB-NEXT:    sub a1, a2, a1
 ; RV32ZBB-NEXT:    ret
 ;
@@ -315,49 +315,49 @@ define i32 @expanded_neg_inv_abs32_unsigned(i32 %x) {
 define i64 @expanded_neg_inv_abs64(i64 %x) {
 ; RV32I-LABEL: expanded_neg_inv_abs64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    snez a2, a0
-; RV32I-NEXT:    neg a3, a1
-; RV32I-NEXT:    sub a2, a3, a2
-; RV32I-NEXT:    neg a3, a0
-; RV32I-NEXT:    beq a2, a1, .LBB6_2
+; RV32I-NEXT:    neg a2, a0
+; RV32I-NEXT:    snez a3, a0
+; RV32I-NEXT:    neg a4, a1
+; RV32I-NEXT:    sub a3, a4, a3
+; RV32I-NEXT:    beq a3, a1, .LBB6_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slt a4, a2, a1
+; RV32I-NEXT:    slt a4, a3, a1
 ; RV32I-NEXT:    beqz a4, .LBB6_3
 ; RV32I-NEXT:    j .LBB6_4
 ; RV32I-NEXT:  .LBB6_2:
-; RV32I-NEXT:    sltu a4, a3, a0
+; RV32I-NEXT:    sltu a4, a2, a0
 ; RV32I-NEXT:    bnez a4, .LBB6_4
 ; RV32I-NEXT:  .LBB6_3:
-; RV32I-NEXT:    mv a3, a0
-; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
 ; RV32I-NEXT:  .LBB6_4:
-; RV32I-NEXT:    neg a0, a3
-; RV32I-NEXT:    snez a1, a3
-; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    neg a0, a2
+; RV32I-NEXT:    snez a1, a2
+; RV32I-NEXT:    neg a2, a3
 ; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: expanded_neg_inv_abs64:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    snez a2, a0
-; RV32ZBB-NEXT:    neg a3, a1
-; RV32ZBB-NEXT:    sub a2, a3, a2
-; RV32ZBB-NEXT:    neg a3, a0
-; RV32ZBB-NEXT:    beq a2, a1, .LBB6_2
+; RV32ZBB-NEXT:    neg a2, a0
+; RV32ZBB-NEXT:    snez a3, a0
+; RV32ZBB-NEXT:    neg a4, a1
+; RV32ZBB-NEXT:    sub a3, a4, a3
+; RV32ZBB-NEXT:    beq a3, a1, .LBB6_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    slt a4, a2, a1
+; RV32ZBB-NEXT:    slt a4, a3, a1
 ; RV32ZBB-NEXT:    beqz a4, .LBB6_3
 ; RV32ZBB-NEXT:    j .LBB6_4
 ; RV32ZBB-NEXT:  .LBB6_2:
-; RV32ZBB-NEXT:    sltu a4, a3, a0
+; RV32ZBB-NEXT:    sltu a4, a2, a0
 ; RV32ZBB-NEXT:    bnez a4, .LBB6_4
 ; RV32ZBB-NEXT:  .LBB6_3:
-; RV32ZBB-NEXT:    mv a3, a0
-; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:    mv a2, a0
+; RV32ZBB-NEXT:    mv a3, a1
 ; RV32ZBB-NEXT:  .LBB6_4:
-; RV32ZBB-NEXT:    neg a0, a3
-; RV32ZBB-NEXT:    snez a1, a3
-; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    neg a0, a2
+; RV32ZBB-NEXT:    snez a1, a2
+; RV32ZBB-NEXT:    neg a2, a3
 ; RV32ZBB-NEXT:    sub a1, a2, a1
 ; RV32ZBB-NEXT:    ret
 ;
@@ -385,49 +385,49 @@ define i64 @expanded_neg_inv_abs64(i64 %x) {
 define i64 @expanded_neg_inv_abs64_unsigned(i64 %x) {
 ; RV32I-LABEL: expanded_neg_inv_abs64_unsigned:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    snez a2, a0
-; RV32I-NEXT:    neg a3, a1
-; RV32I-NEXT:    sub a2, a3, a2
-; RV32I-NEXT:    neg a3, a0
-; RV32I-NEXT:    beq a2, a1, .LBB7_2
+; RV32I-NEXT:    neg a2, a0
+; RV32I-NEXT:    snez a3, a0
+; RV32I-NEXT:    neg a4, a1
+; RV32I-NEXT:    sub a3, a4, a3
+; RV32I-NEXT:    beq a3, a1, .LBB7_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu a4, a2, a1
+; RV32I-NEXT:    sltu a4, a3, a1
 ; RV32I-NEXT:    beqz a4, .LBB7_3
 ; RV32I-NEXT:    j .LBB7_4
 ; RV32I-NEXT:  .LBB7_2:
-; RV32I-NEXT:    sltu a4, a3, a0
+; RV32I-NEXT:    sltu a4, a2, a0
 ; RV32I-NEXT:    bnez a4, .LBB7_4
 ; RV32I-NEXT:  .LBB7_3:
-; RV32I-NEXT:    mv a3, a0
-; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
 ; RV32I-NEXT:  .LBB7_4:
-; RV32I-NEXT:    neg a0, a3
-; RV32I-NEXT:    snez a1, a3
-; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    neg a0, a2
+; RV32I-NEXT:    snez a1, a2
+; RV32I-NEXT:    neg a2, a3
 ; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: expanded_neg_inv_abs64_unsigned:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    snez a2, a0
-; RV32ZBB-NEXT:    neg a3, a1
-; RV32ZBB-NEXT:    sub a2, a3, a2
-; RV32ZBB-NEXT:    neg a3, a0
-; RV32ZBB-NEXT:    beq a2, a1, .LBB7_2
+; RV32ZBB-NEXT:    neg a2, a0
+; RV32ZBB-NEXT:    snez a3, a0
+; RV32ZBB-NEXT:    neg a4, a1
+; RV32ZBB-NEXT:    sub a3, a4, a3
+; RV32ZBB-NEXT:    beq a3, a1, .LBB7_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu a4, a2, a1
+; RV32ZBB-NEXT:    sltu a4, a3, a1
 ; RV32ZBB-NEXT:    beqz a4, .LBB7_3
 ; RV32ZBB-NEXT:    j .LBB7_4
 ; RV32ZBB-NEXT:  .LBB7_2:
-; RV32ZBB-NEXT:    sltu a4, a3, a0
+; RV32ZBB-NEXT:    sltu a4, a2, a0
 ; RV32ZBB-NEXT:    bnez a4, .LBB7_4
 ; RV32ZBB-NEXT:  .LBB7_3:
-; RV32ZBB-NEXT:    mv a3, a0
-; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:    mv a2, a0
+; RV32ZBB-NEXT:    mv a3, a1
 ; RV32ZBB-NEXT:  .LBB7_4:
-; RV32ZBB-NEXT:    neg a0, a3
-; RV32ZBB-NEXT:    snez a1, a3
-; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    neg a0, a2
+; RV32ZBB-NEXT:    snez a1, a2
+; RV32ZBB-NEXT:    neg a2, a3
 ; RV32ZBB-NEXT:    sub a1, a2, a1
 ; RV32ZBB-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll b/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll
index cb2037f5fb027..28dde9a3472c2 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll
@@ -424,11 +424,11 @@ define double @fmsub_d(double %a, double %b, double %c) nounwind {
 ; RV32I-NEXT:    mv s2, a2
 ; RV32I-NEXT:    mv s3, a3
 ; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    lui a1, %hi(.LCPI12_0)
-; RV32I-NEXT:    addi a1, a1, %lo(.LCPI12_0)
-; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a3, 4(a1)
 ; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    lui a2, %hi(.LCPI12_0)
+; RV32I-NEXT:    addi a3, a2, %lo(.LCPI12_0)
+; RV32I-NEXT:    lw a2, 0(a3)
+; RV32I-NEXT:    lw a3, 4(a3)
 ; RV32I-NEXT:    call __adddf3
 ; RV32I-NEXT:    mv a4, a0
 ; RV32I-NEXT:    lui a5, 524288
@@ -454,9 +454,9 @@ define double @fmsub_d(double %a, double %b, double %c) nounwind {
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    lui a0, %hi(.LCPI12_0)
-; RV64I-NEXT:    ld a1, %lo(.LCPI12_0)(a0)
 ; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    lui a1, %hi(.LCPI12_0)
+; RV64I-NEXT:    ld a1, %lo(.LCPI12_0)(a1)
 ; RV64I-NEXT:    call __adddf3
 ; RV64I-NEXT:    li a1, -1
 ; RV64I-NEXT:    slli a1, a1, 63
@@ -511,20 +511,20 @@ define double @fnmadd_d(double %a, double %b, double %c) nounwind {
 ; RV32I-NEXT:    mv s0, a2
 ; RV32I-NEXT:    mv s1, a3
 ; RV32I-NEXT:    mv s2, a4
+; RV32I-NEXT:    mv s3, a5
 ; RV32I-NEXT:    lui a2, %hi(.LCPI13_0)
 ; RV32I-NEXT:    addi a2, a2, %lo(.LCPI13_0)
-; RV32I-NEXT:    lw s3, 0(a2)
-; RV32I-NEXT:    lw s4, 4(a2)
-; RV32I-NEXT:    mv s5, a5
-; RV32I-NEXT:    mv a2, s3
-; RV32I-NEXT:    mv a3, s4
+; RV32I-NEXT:    lw s4, 0(a2)
+; RV32I-NEXT:    lw s5, 4(a2)
+; RV32I-NEXT:    mv a2, s4
+; RV32I-NEXT:    mv a3, s5
 ; RV32I-NEXT:    call __adddf3
 ; RV32I-NEXT:    mv s6, a0
 ; RV32I-NEXT:    mv s7, a1
 ; RV32I-NEXT:    mv a0, s2
-; RV32I-NEXT:    mv a1, s5
-; RV32I-NEXT:    mv a2, s3
-; RV32I-NEXT:    mv a3, s4
+; RV32I-NEXT:    mv a1, s3
+; RV32I-NEXT:    mv a2, s4
+; RV32I-NEXT:    mv a3, s5
 ; RV32I-NEXT:    call __adddf3
 ; RV32I-NEXT:    mv a4, a0
 ; RV32I-NEXT:    lui a5, 524288
@@ -556,14 +556,14 @@ define double @fnmadd_d(double %a, double %b, double %c) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a2
 ; RV64I-NEXT:    lui a1, %hi(.LCPI13_0)
-; RV64I-NEXT:    ld s1, %lo(.LCPI13_0)(a1)
-; RV64I-NEXT:    mv s2, a2
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    ld s2, %lo(.LCPI13_0)(a1)
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __adddf3
 ; RV64I-NEXT:    mv s3, a0
-; RV64I-NEXT:    mv a0, s2
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __adddf3
 ; RV64I-NEXT:    li a1, -1
 ; RV64I-NEXT:    slli a2, a1, 63
@@ -625,20 +625,20 @@ define double @fnmadd_d_2(double %a, double %b, double %c) nounwind {
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    mv s2, a4
+; RV32I-NEXT:    mv s3, a5
 ; RV32I-NEXT:    lui a2, %hi(.LCPI14_0)
 ; RV32I-NEXT:    addi a2, a2, %lo(.LCPI14_0)
-; RV32I-NEXT:    lw s3, 0(a2)
-; RV32I-NEXT:    lw s4, 4(a2)
-; RV32I-NEXT:    mv s5, a5
-; RV32I-NEXT:    mv a2, s3
-; RV32I-NEXT:    mv a3, s4
+; RV32I-NEXT:    lw s4, 0(a2)
+; RV32I-NEXT:    lw s5, 4(a2)
+; RV32I-NEXT:    mv a2, s4
+; RV32I-NEXT:    mv a3, s5
 ; RV32I-NEXT:    call __adddf3
 ; RV32I-NEXT:    mv s6, a0
 ; RV32I-NEXT:    mv s7, a1
 ; RV32I-NEXT:    mv a0, s2
-; RV32I-NEXT:    mv a1, s5
-; RV32I-NEXT:    mv a2, s3
-; RV32I-NEXT:    mv a3, s4
+; RV32I-NEXT:    mv a1, s3
+; RV32I-NEXT:    mv a2, s4
+; RV32I-NEXT:    mv a3, s5
 ; RV32I-NEXT:    call __adddf3
 ; RV32I-NEXT:    mv a4, a0
 ; RV32I-NEXT:    lui a5, 524288
@@ -670,14 +670,14 @@ define double @fnmadd_d_2(double %a, double %b, double %c) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:    mv s1, a2
 ; RV64I-NEXT:    lui a1, %hi(.LCPI14_0)
-; RV64I-NEXT:    ld s1, %lo(.LCPI14_0)(a1)
-; RV64I-NEXT:    mv s2, a2
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    ld s2, %lo(.LCPI14_0)(a1)
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __adddf3
 ; RV64I-NEXT:    mv s3, a0
-; RV64I-NEXT:    mv a0, s2
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __adddf3
 ; RV64I-NEXT:    li a1, -1
 ; RV64I-NEXT:    slli a2, a1, 63
@@ -799,11 +799,11 @@ define double @fnmsub_d(double %a, double %b, double %c) nounwind {
 ; RV32I-NEXT:    mv s0, a2
 ; RV32I-NEXT:    mv s1, a3
 ; RV32I-NEXT:    mv s2, a4
+; RV32I-NEXT:    mv s3, a5
 ; RV32I-NEXT:    lui a2, %hi(.LCPI17_0)
 ; RV32I-NEXT:    addi a3, a2, %lo(.LCPI17_0)
 ; RV32I-NEXT:    lw a2, 0(a3)
 ; RV32I-NEXT:    lw a3, 4(a3)
-; RV32I-NEXT:    mv s3, a5
 ; RV32I-NEXT:    call __adddf3
 ; RV32I-NEXT:    lui a2, 524288
 ; RV32I-NEXT:    xor a1, a1, a2
@@ -827,9 +827,9 @@ define double @fnmsub_d(double %a, double %b, double %c) nounwind {
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a2
 ; RV64I-NEXT:    lui a1, %hi(.LCPI17_0)
 ; RV64I-NEXT:    ld a1, %lo(.LCPI17_0)(a1)
-; RV64I-NEXT:    mv s1, a2
 ; RV64I-NEXT:    call __adddf3
 ; RV64I-NEXT:    li a1, -1
 ; RV64I-NEXT:    slli a1, a1, 63
@@ -880,11 +880,11 @@ define double @fnmsub_d_2(double %a, double %b, double %c) nounwind {
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    mv s2, a4
+; RV32I-NEXT:    mv s3, a5
 ; RV32I-NEXT:    lui a2, %hi(.LCPI18_0)
 ; RV32I-NEXT:    addi a3, a2, %lo(.LCPI18_0)
 ; RV32I-NEXT:    lw a2, 0(a3)
 ; RV32I-NEXT:    lw a3, 4(a3)
-; RV32I-NEXT:    mv s3, a5
 ; RV32I-NEXT:    call __adddf3
 ; RV32I-NEXT:    mv a2, a0
 ; RV32I-NEXT:    lui a3, 524288
@@ -910,9 +910,9 @@ define double @fnmsub_d_2(double %a, double %b, double %c) nounwind {
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:    mv s1, a2
 ; RV64I-NEXT:    lui a1, %hi(.LCPI18_0)
 ; RV64I-NEXT:    ld a1, %lo(.LCPI18_0)(a1)
-; RV64I-NEXT:    mv s1, a2
 ; RV64I-NEXT:    call __adddf3
 ; RV64I-NEXT:    li a1, -1
 ; RV64I-NEXT:    slli a1, a1, 63
@@ -1009,11 +1009,11 @@ define double @fmsub_d_contract(double %a, double %b, double %c) nounwind {
 ; RV32I-NEXT:    mv s2, a2
 ; RV32I-NEXT:    mv s3, a3
 ; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    lui a1, %hi(.LCPI20_0)
-; RV32I-NEXT:    addi a1, a1, %lo(.LCPI20_0)
-; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a3, 4(a1)
 ; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    lui a2, %hi(.LCPI20_0)
+; RV32I-NEXT:    addi a3, a2, %lo(.LCPI20_0)
+; RV32I-NEXT:    lw a2, 0(a3)
+; RV32I-NEXT:    lw a3, 4(a3)
 ; RV32I-NEXT:    call __adddf3
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    mv s5, a1
@@ -1044,9 +1044,9 @@ define double @fmsub_d_contract(double %a, double %b, double %c) nounwind {
 ; RV64I-NEXT:    sd s2, 0(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    lui a0, %hi(.LCPI20_0)
-; RV64I-NEXT:    ld a1, %lo(.LCPI20_0)(a0)
 ; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    lui a1, %hi(.LCPI20_0)
+; RV64I-NEXT:    ld a1, %lo(.LCPI20_0)(a1)
 ; RV64I-NEXT:    call __adddf3
 ; RV64I-NEXT:    mv s2, a0
 ; RV64I-NEXT:    mv a0, s0
@@ -1108,27 +1108,27 @@ define double @fnmadd_d_contract(double %a, double %b, double %c) nounwind {
 ; RV32I-NEXT:    mv s0, a2
 ; RV32I-NEXT:    mv s1, a3
 ; RV32I-NEXT:    mv s2, a4
+; RV32I-NEXT:    mv s3, a5
 ; RV32I-NEXT:    lui a2, %hi(.LCPI21_0)
 ; RV32I-NEXT:    addi a2, a2, %lo(.LCPI21_0)
-; RV32I-NEXT:    lw s3, 0(a2)
-; RV32I-NEXT:    lw s4, 4(a2)
-; RV32I-NEXT:    mv s5, a5
-; RV32I-NEXT:    mv a2, s3
-; RV32I-NEXT:    mv a3, s4
+; RV32I-NEXT:    lw s4, 0(a2)
+; RV32I-NEXT:    lw s5, 4(a2)
+; RV32I-NEXT:    mv a2, s4
+; RV32I-NEXT:    mv a3, s5
 ; RV32I-NEXT:    call __adddf3
 ; RV32I-NEXT:    mv s6, a0
 ; RV32I-NEXT:    mv s7, a1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    mv a1, s1
-; RV32I-NEXT:    mv a2, s3
-; RV32I-NEXT:    mv a3, s4
+; RV32I-NEXT:    mv a2, s4
+; RV32I-NEXT:    mv a3, s5
 ; RV32I-NEXT:    call __adddf3
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    mv s1, a1
 ; RV32I-NEXT:    mv a0, s2
-; RV32I-NEXT:    mv a1, s5
-; RV32I-NEXT:    mv a2, s3
-; RV32I-NEXT:    mv a3, s4
+; RV32I-NEXT:    mv a1, s3
+; RV32I-NEXT:    mv a2, s4
+; RV32I-NEXT:    mv a3, s5
 ; RV32I-NEXT:    call __adddf3
 ; RV32I-NEXT:    mv s2, a0
 ; RV32I-NEXT:    mv s3, a1
@@ -1163,18 +1163,18 @@ define double @fnmadd_d_contract(double %a, double %b, double %c) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a2
 ; RV64I-NEXT:    lui a1, %hi(.LCPI21_0)
-; RV64I-NEXT:    ld s1, %lo(.LCPI21_0)(a1)
-; RV64I-NEXT:    mv s2, a2
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    ld s2, %lo(.LCPI21_0)(a1)
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __adddf3
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __adddf3
 ; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    mv a0, s2
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __adddf3
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    mv a0, s3
@@ -1237,20 +1237,20 @@ define double @fnmsub_d_contract(double %a, double %b, double %c) nounwind {
 ; RV32I-NEXT:    mv s0, a2
 ; RV32I-NEXT:    mv s1, a3
 ; RV32I-NEXT:    mv s2, a4
+; RV32I-NEXT:    mv s3, a5
 ; RV32I-NEXT:    lui a2, %hi(.LCPI22_0)
 ; RV32I-NEXT:    addi a2, a2, %lo(.LCPI22_0)
-; RV32I-NEXT:    lw s3, 0(a2)
-; RV32I-NEXT:    lw s4, 4(a2)
-; RV32I-NEXT:    mv s5, a5
-; RV32I-NEXT:    mv a2, s3
-; RV32I-NEXT:    mv a3, s4
+; RV32I-NEXT:    lw s4, 0(a2)
+; RV32I-NEXT:    lw s5, 4(a2)
+; RV32I-NEXT:    mv a2, s4
+; RV32I-NEXT:    mv a3, s5
 ; RV32I-NEXT:    call __adddf3
 ; RV32I-NEXT:    mv s6, a0
 ; RV32I-NEXT:    mv s7, a1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    mv a1, s1
-; RV32I-NEXT:    mv a2, s3
-; RV32I-NEXT:    mv a3, s4
+; RV32I-NEXT:    mv a2, s4
+; RV32I-NEXT:    mv a3, s5
 ; RV32I-NEXT:    call __adddf3
 ; RV32I-NEXT:    mv a2, a0
 ; RV32I-NEXT:    mv a3, a1
@@ -1260,7 +1260,7 @@ define double @fnmsub_d_contract(double %a, double %b, double %c) nounwind {
 ; RV32I-NEXT:    mv a2, a0
 ; RV32I-NEXT:    mv a3, a1
 ; RV32I-NEXT:    mv a0, s2
-; RV32I-NEXT:    mv a1, s5
+; RV32I-NEXT:    mv a1, s3
 ; RV32I-NEXT:    call __subdf3
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
@@ -1283,20 +1283,20 @@ define double @fnmsub_d_contract(double %a, double %b, double %c) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a2
 ; RV64I-NEXT:    lui a1, %hi(.LCPI22_0)
-; RV64I-NEXT:    ld s1, %lo(.LCPI22_0)(a1)
-; RV64I-NEXT:    mv s2, a2
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    ld s2, %lo(.LCPI22_0)(a1)
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __adddf3
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __adddf3
 ; RV64I-NEXT:    mv a1, a0
 ; RV64I-NEXT:    mv a0, s3
 ; RV64I-NEXT:    call __muldf3
 ; RV64I-NEXT:    mv a1, a0
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __subdf3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll b/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll
index fdeda0c273f6d..676f0f5ec3eb8 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll
@@ -414,9 +414,9 @@ define float @fmsub_s(float %a, float %b, float %c) nounwind {
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    lui a0, %hi(.LCPI12_0)
-; RV32I-NEXT:    lw a1, %lo(.LCPI12_0)(a0)
 ; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    lui a1, %hi(.LCPI12_0)
+; RV32I-NEXT:    lw a1, %lo(.LCPI12_0)(a1)
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    lui a2, 524288
 ; RV32I-NEXT:    xor a2, a0, a2
@@ -437,9 +437,9 @@ define float @fmsub_s(float %a, float %b, float %c) nounwind {
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    lui a0, %hi(.LCPI12_0)
-; RV64I-NEXT:    lw a1, %lo(.LCPI12_0)(a0)
 ; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    lui a1, %hi(.LCPI12_0)
+; RV64I-NEXT:    lw a1, %lo(.LCPI12_0)(a1)
 ; RV64I-NEXT:    call __addsf3
 ; RV64I-NEXT:    lui a2, 524288
 ; RV64I-NEXT:    xor a2, a0, a2
@@ -475,14 +475,14 @@ define float @fnmadd_s(float %a, float %b, float %c) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    lui a1, %hi(.LCPI13_0)
-; RV32I-NEXT:    lw s1, %lo(.LCPI13_0)(a1)
-; RV32I-NEXT:    mv s2, a2
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    lw s2, %lo(.LCPI13_0)(a1)
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    mv s3, a0
-; RV32I-NEXT:    mv a0, s2
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    lui a2, 524288
 ; RV32I-NEXT:    xor a1, s3, a2
@@ -507,14 +507,14 @@ define float @fnmadd_s(float %a, float %b, float %c) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a2
 ; RV64I-NEXT:    lui a1, %hi(.LCPI13_0)
-; RV64I-NEXT:    lw s1, %lo(.LCPI13_0)(a1)
-; RV64I-NEXT:    mv s2, a2
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    lw s2, %lo(.LCPI13_0)(a1)
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __addsf3
 ; RV64I-NEXT:    mv s3, a0
-; RV64I-NEXT:    mv a0, s2
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __addsf3
 ; RV64I-NEXT:    lui a2, 524288
 ; RV64I-NEXT:    xor a1, s3, a2
@@ -556,14 +556,14 @@ define float @fnmadd_s_2(float %a, float %b, float %c) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    mv a0, a1
+; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    lui a1, %hi(.LCPI14_0)
-; RV32I-NEXT:    lw s1, %lo(.LCPI14_0)(a1)
-; RV32I-NEXT:    mv s2, a2
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    lw s2, %lo(.LCPI14_0)(a1)
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    mv s3, a0
-; RV32I-NEXT:    mv a0, s2
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    lui a2, 524288
 ; RV32I-NEXT:    xor a1, s3, a2
@@ -588,14 +588,14 @@ define float @fnmadd_s_2(float %a, float %b, float %c) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:    mv s1, a2
 ; RV64I-NEXT:    lui a1, %hi(.LCPI14_0)
-; RV64I-NEXT:    lw s1, %lo(.LCPI14_0)(a1)
-; RV64I-NEXT:    mv s2, a2
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    lw s2, %lo(.LCPI14_0)(a1)
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __addsf3
 ; RV64I-NEXT:    mv s3, a0
-; RV64I-NEXT:    mv a0, s2
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __addsf3
 ; RV64I-NEXT:    lui a2, 524288
 ; RV64I-NEXT:    xor a1, s3, a2
@@ -720,9 +720,9 @@ define float @fnmsub_s(float %a, float %b, float %c) nounwind {
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    lui a1, %hi(.LCPI17_0)
 ; RV32I-NEXT:    lw a1, %lo(.LCPI17_0)(a1)
-; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    lui a1, 524288
 ; RV32I-NEXT:    xor a0, a0, a1
@@ -742,9 +742,9 @@ define float @fnmsub_s(float %a, float %b, float %c) nounwind {
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a2
 ; RV64I-NEXT:    lui a1, %hi(.LCPI17_0)
 ; RV64I-NEXT:    lw a1, %lo(.LCPI17_0)(a1)
-; RV64I-NEXT:    mv s1, a2
 ; RV64I-NEXT:    call __addsf3
 ; RV64I-NEXT:    lui a1, 524288
 ; RV64I-NEXT:    xor a0, a0, a1
@@ -778,9 +778,9 @@ define float @fnmsub_s_2(float %a, float %b, float %c) nounwind {
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    mv a0, a1
+; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    lui a1, %hi(.LCPI18_0)
 ; RV32I-NEXT:    lw a1, %lo(.LCPI18_0)(a1)
-; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    lui a1, 524288
 ; RV32I-NEXT:    xor a1, a0, a1
@@ -801,9 +801,9 @@ define float @fnmsub_s_2(float %a, float %b, float %c) nounwind {
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:    mv s1, a2
 ; RV64I-NEXT:    lui a1, %hi(.LCPI18_0)
 ; RV64I-NEXT:    lw a1, %lo(.LCPI18_0)(a1)
-; RV64I-NEXT:    mv s1, a2
 ; RV64I-NEXT:    call __addsf3
 ; RV64I-NEXT:    lui a1, 524288
 ; RV64I-NEXT:    xor a1, a0, a1
@@ -877,9 +877,9 @@ define float @fmsub_s_contract(float %a, float %b, float %c) nounwind {
 ; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    lui a0, %hi(.LCPI20_0)
-; RV32I-NEXT:    lw a1, %lo(.LCPI20_0)(a0)
 ; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    lui a1, %hi(.LCPI20_0)
+; RV32I-NEXT:    lw a1, %lo(.LCPI20_0)(a1)
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    mv s2, a0
 ; RV32I-NEXT:    mv a0, s0
@@ -903,9 +903,9 @@ define float @fmsub_s_contract(float %a, float %b, float %c) nounwind {
 ; RV64I-NEXT:    sd s2, 0(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    lui a0, %hi(.LCPI20_0)
-; RV64I-NEXT:    lw a1, %lo(.LCPI20_0)(a0)
 ; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    lui a1, %hi(.LCPI20_0)
+; RV64I-NEXT:    lw a1, %lo(.LCPI20_0)(a1)
 ; RV64I-NEXT:    call __addsf3
 ; RV64I-NEXT:    mv s2, a0
 ; RV64I-NEXT:    mv a0, s0
@@ -946,18 +946,18 @@ define float @fnmadd_s_contract(float %a, float %b, float %c) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    lui a1, %hi(.LCPI21_0)
-; RV32I-NEXT:    lw s1, %lo(.LCPI21_0)(a1)
-; RV32I-NEXT:    mv s2, a2
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    lw s2, %lo(.LCPI21_0)(a1)
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    mv a0, s2
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    mv a0, s3
@@ -984,18 +984,18 @@ define float @fnmadd_s_contract(float %a, float %b, float %c) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a2
 ; RV64I-NEXT:    lui a1, %hi(.LCPI21_0)
-; RV64I-NEXT:    lw s1, %lo(.LCPI21_0)(a1)
-; RV64I-NEXT:    mv s2, a2
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    lw s2, %lo(.LCPI21_0)(a1)
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __addsf3
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __addsf3
 ; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    mv a0, s2
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __addsf3
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    mv a0, s3
@@ -1039,20 +1039,20 @@ define float @fnmsub_s_contract(float %a, float %b, float %c) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    lui a1, %hi(.LCPI22_0)
-; RV32I-NEXT:    lw s1, %lo(.LCPI22_0)(a1)
-; RV32I-NEXT:    mv s2, a2
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    lw s2, %lo(.LCPI22_0)(a1)
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    mv a1, a0
 ; RV32I-NEXT:    mv a0, s3
 ; RV32I-NEXT:    call __mulsf3
 ; RV32I-NEXT:    mv a1, a0
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __subsf3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -1071,20 +1071,20 @@ define float @fnmsub_s_contract(float %a, float %b, float %c) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a2
 ; RV64I-NEXT:    lui a1, %hi(.LCPI22_0)
-; RV64I-NEXT:    lw s1, %lo(.LCPI22_0)(a1)
-; RV64I-NEXT:    mv s2, a2
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    lw s2, %lo(.LCPI22_0)(a1)
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __addsf3
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    mv a1, s1
+; RV64I-NEXT:    mv a1, s2
 ; RV64I-NEXT:    call __addsf3
 ; RV64I-NEXT:    mv a1, a0
 ; RV64I-NEXT:    mv a0, s3
 ; RV64I-NEXT:    call __mulsf3
 ; RV64I-NEXT:    mv a1, a0
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __subsf3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll b/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll
index 234f338412066..36ff827ebf32a 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll
@@ -142,20 +142,20 @@ define i32 @freeze_anonstruct(ptr %p) {
 define i32 @freeze_anonstruct2(ptr %p) {
 ; RV32-LABEL: freeze_anonstruct2:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lh a1, 4(a0)
-; RV32-NEXT:    lw a0, 0(a0)
-; RV32-NEXT:    slli a1, a1, 16
-; RV32-NEXT:    srli a1, a1, 16
-; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lh a0, 4(a0)
+; RV32-NEXT:    slli a0, a0, 16
+; RV32-NEXT:    srli a0, a0, 16
+; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: freeze_anonstruct2:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lh a1, 4(a0)
-; RV64-NEXT:    lw a0, 0(a0)
-; RV64-NEXT:    slli a1, a1, 48
-; RV64-NEXT:    srli a1, a1, 48
-; RV64-NEXT:    addw a0, a0, a1
+; RV64-NEXT:    lw a1, 0(a0)
+; RV64-NEXT:    lh a0, 4(a0)
+; RV64-NEXT:    slli a0, a0, 48
+; RV64-NEXT:    srli a0, a0, 48
+; RV64-NEXT:    addw a0, a1, a0
 ; RV64-NEXT:    ret
   %s = load {i32, i16}, ptr %p
   %y1 = freeze {i32, i16} %s
@@ -169,20 +169,20 @@ define i32 @freeze_anonstruct2(ptr %p) {
 define i32 @freeze_anonstruct2_sext(ptr %p) {
 ; RV32-LABEL: freeze_anonstruct2_sext:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lh a1, 4(a0)
-; RV32-NEXT:    lw a0, 0(a0)
-; RV32-NEXT:    slli a1, a1, 16
-; RV32-NEXT:    srai a1, a1, 16
-; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lh a0, 4(a0)
+; RV32-NEXT:    slli a0, a0, 16
+; RV32-NEXT:    srai a0, a0, 16
+; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: freeze_anonstruct2_sext:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lh a1, 4(a0)
-; RV64-NEXT:    lw a0, 0(a0)
-; RV64-NEXT:    slli a1, a1, 48
-; RV64-NEXT:    srai a1, a1, 48
-; RV64-NEXT:    addw a0, a0, a1
+; RV64-NEXT:    lw a1, 0(a0)
+; RV64-NEXT:    lh a0, 4(a0)
+; RV64-NEXT:    slli a0, a0, 48
+; RV64-NEXT:    srai a0, a0, 48
+; RV64-NEXT:    addw a0, a1, a0
 ; RV64-NEXT:    ret
   %s = load {i32, i16}, ptr %p
   %y1 = freeze {i32, i16} %s
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
index 8a786fc9993d2..6e13179bfe77e 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll
@@ -782,8 +782,8 @@ define i32 @rotr_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind {
 define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
 ; RV32I-LABEL: rotl_64_mask:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    li a5, 32
 ; RV32I-NEXT:    neg a4, a2
+; RV32I-NEXT:    li a5, 32
 ; RV32I-NEXT:    bltu a2, a5, .LBB10_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    li a3, 0
@@ -837,8 +837,8 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
 ;
 ; RV32ZBB-LABEL: rotl_64_mask:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    li a5, 32
 ; RV32ZBB-NEXT:    neg a4, a2
+; RV32ZBB-NEXT:    li a5, 32
 ; RV32ZBB-NEXT:    bltu a2, a5, .LBB10_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    li a3, 0
@@ -892,8 +892,8 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
 ;
 ; RV32XTHEADBB-LABEL: rotl_64_mask:
 ; RV32XTHEADBB:       # %bb.0:
-; RV32XTHEADBB-NEXT:    li a5, 32
 ; RV32XTHEADBB-NEXT:    neg a4, a2
+; RV32XTHEADBB-NEXT:    li a5, 32
 ; RV32XTHEADBB-NEXT:    bltu a2, a5, .LBB10_2
 ; RV32XTHEADBB-NEXT:  # %bb.1:
 ; RV32XTHEADBB-NEXT:    li a3, 0
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
index 9a6c718703a27..e1019c63408ee 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
@@ -68,8 +68,8 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sext.w a1, a0
 ; RV64I-NEXT:    li s0, 31
+; RV64I-NEXT:    sext.w a1, a0
 ; RV64I-NEXT:    beqz a1, .LBB1_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
index 558424b53be95..12afb3adf2f69 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
@@ -115,8 +115,8 @@ define i64 @pack_i64_3(ptr %0, ptr %1) {
 ; RV64I-LABEL: pack_i64_3:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lwu a0, 0(a0)
-; RV64I-NEXT:    lwu a1, 0(a1)
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    lwu a1, 0(a1)
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
index 8b262db56ccd2..8bffb0772eeef 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
@@ -503,9 +503,9 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
 define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: shl128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a2, 0(a2)
 ; RV32I-NEXT:    lw a7, 0(a1)
 ; RV32I-NEXT:    lw a3, 4(a1)
+; RV32I-NEXT:    lw a2, 0(a2)
 ; RV32I-NEXT:    li a6, 64
 ; RV32I-NEXT:    li t1, 32
 ; RV32I-NEXT:    neg t5, a2
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/RISCV/GlobalISel/stacksave-stackrestore.ll
index caa749729ce19..11912483f8d9c 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/stacksave-stackrestore.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/stacksave-stackrestore.ll
@@ -17,10 +17,10 @@ define void @test_scoped_alloca(i64 %n) {
 ; RV32-NEXT:    .cfi_offset s1, -12
 ; RV32-NEXT:    addi s0, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
+; RV32-NEXT:    mv s1, sp
 ; RV32-NEXT:    addi a0, a0, 15
 ; RV32-NEXT:    andi a0, a0, -16
 ; RV32-NEXT:    sub a0, sp, a0
-; RV32-NEXT:    mv s1, sp
 ; RV32-NEXT:    mv sp, a0
 ; RV32-NEXT:    call use_addr
 ; RV32-NEXT:    mv sp, s1
@@ -48,10 +48,10 @@ define void @test_scoped_alloca(i64 %n) {
 ; RV64-NEXT:    .cfi_offset s1, -24
 ; RV64-NEXT:    addi s0, sp, 32
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
+; RV64-NEXT:    mv s1, sp
 ; RV64-NEXT:    addi a0, a0, 15
 ; RV64-NEXT:    andi a0, a0, -16
 ; RV64-NEXT:    sub a0, sp, a0
-; RV64-NEXT:    mv s1, sp
 ; RV64-NEXT:    mv sp, a0
 ; RV64-NEXT:    call use_addr
 ; RV64-NEXT:    mv sp, s1
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
index fc9be94988451..ba67b45ebbe7d 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
@@ -49,12 +49,12 @@ define i32 @va1(ptr %fmt, ...) {
 ; RV32-NEXT:    sw a2, 24(sp)
 ; RV32-NEXT:    sw a3, 28(sp)
 ; RV32-NEXT:    sw a4, 32(sp)
-; RV32-NEXT:    addi a0, sp, 20
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    lw a0, 12(sp)
 ; RV32-NEXT:    sw a5, 36(sp)
 ; RV32-NEXT:    sw a6, 40(sp)
 ; RV32-NEXT:    sw a7, 44(sp)
+; RV32-NEXT:    addi a0, sp, 20
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    lw a0, 12(sp)
 ; RV32-NEXT:    addi a1, a0, 4
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lw a0, 0(a0)
@@ -103,12 +103,12 @@ define i32 @va1(ptr %fmt, ...) {
 ; RV32-WITHFP-NEXT:    sw a2, 8(s0)
 ; RV32-WITHFP-NEXT:    sw a3, 12(s0)
 ; RV32-WITHFP-NEXT:    sw a4, 16(s0)
-; RV32-WITHFP-NEXT:    addi a0, s0, 4
-; RV32-WITHFP-NEXT:    sw a0, -12(s0)
-; RV32-WITHFP-NEXT:    lw a0, -12(s0)
 ; RV32-WITHFP-NEXT:    sw a5, 20(s0)
 ; RV32-WITHFP-NEXT:    sw a6, 24(s0)
 ; RV32-WITHFP-NEXT:    sw a7, 28(s0)
+; RV32-WITHFP-NEXT:    addi a0, s0, 4
+; RV32-WITHFP-NEXT:    sw a0, -12(s0)
+; RV32-WITHFP-NEXT:    lw a0, -12(s0)
 ; RV32-WITHFP-NEXT:    addi a1, a0, 4
 ; RV32-WITHFP-NEXT:    sw a1, -12(s0)
 ; RV32-WITHFP-NEXT:    lw a0, 0(a0)
@@ -517,12 +517,12 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; ILP32-NEXT:    sw a2, 24(sp)
 ; ILP32-NEXT:    sw a3, 28(sp)
 ; ILP32-NEXT:    sw a4, 32(sp)
-; ILP32-NEXT:    addi a0, sp, 20
-; ILP32-NEXT:    sw a0, 12(sp)
-; ILP32-NEXT:    lw a0, 12(sp)
 ; ILP32-NEXT:    sw a5, 36(sp)
 ; ILP32-NEXT:    sw a6, 40(sp)
 ; ILP32-NEXT:    sw a7, 44(sp)
+; ILP32-NEXT:    addi a0, sp, 20
+; ILP32-NEXT:    sw a0, 12(sp)
+; ILP32-NEXT:    lw a0, 12(sp)
 ; ILP32-NEXT:    addi a1, a0, 7
 ; ILP32-NEXT:    addi a0, a0, 15
 ; ILP32-NEXT:    andi a1, a1, -8
@@ -635,12 +635,12 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; RV32-WITHFP-NEXT:    sw a2, 8(s0)
 ; RV32-WITHFP-NEXT:    sw a3, 12(s0)
 ; RV32-WITHFP-NEXT:    sw a4, 16(s0)
-; RV32-WITHFP-NEXT:    addi a0, s0, 4
-; RV32-WITHFP-NEXT:    sw a0, -12(s0)
-; RV32-WITHFP-NEXT:    lw a0, -12(s0)
 ; RV32-WITHFP-NEXT:    sw a5, 20(s0)
 ; RV32-WITHFP-NEXT:    sw a6, 24(s0)
 ; RV32-WITHFP-NEXT:    sw a7, 28(s0)
+; RV32-WITHFP-NEXT:    addi a0, s0, 4
+; RV32-WITHFP-NEXT:    sw a0, -12(s0)
+; RV32-WITHFP-NEXT:    lw a0, -12(s0)
 ; RV32-WITHFP-NEXT:    addi a1, a0, 7
 ; RV32-WITHFP-NEXT:    addi a0, a0, 15
 ; RV32-WITHFP-NEXT:    andi a1, a1, -8
@@ -854,14 +854,14 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; ILP32-LABEL: va3:
 ; ILP32:       # %bb.0:
 ; ILP32-NEXT:    addi sp, sp, -32
-; ILP32-NEXT:    addi a0, sp, 12
-; ILP32-NEXT:    sw a0, 4(sp)
-; ILP32-NEXT:    lw a0, 4(sp)
 ; ILP32-NEXT:    sw a3, 12(sp)
 ; ILP32-NEXT:    sw a4, 16(sp)
 ; ILP32-NEXT:    sw a5, 20(sp)
 ; ILP32-NEXT:    sw a6, 24(sp)
 ; ILP32-NEXT:    sw a7, 28(sp)
+; ILP32-NEXT:    addi a0, sp, 12
+; ILP32-NEXT:    sw a0, 4(sp)
+; ILP32-NEXT:    lw a0, 4(sp)
 ; ILP32-NEXT:    addi a3, a0, 7
 ; ILP32-NEXT:    addi a0, a0, 15
 ; ILP32-NEXT:    andi a3, a3, -8
@@ -956,13 +956,13 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; RV64-LABEL: va3:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    sd a0, 8(sp)
-; RV64-NEXT:    ld a0, 8(sp)
 ; RV64-NEXT:    sd a2, 16(sp)
 ; RV64-NEXT:    sd a3, 24(sp)
 ; RV64-NEXT:    sd a4, 32(sp)
 ; RV64-NEXT:    sd a5, 40(sp)
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    sd a0, 8(sp)
+; RV64-NEXT:    ld a0, 8(sp)
 ; RV64-NEXT:    sd a6, 48(sp)
 ; RV64-NEXT:    sd a7, 56(sp)
 ; RV64-NEXT:    addi a2, a0, 7
@@ -980,14 +980,14 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; RV32-WITHFP-NEXT:    sw ra, 20(sp) # 4-byte Folded Spill
 ; RV32-WITHFP-NEXT:    sw s0, 16(sp) # 4-byte Folded Spill
 ; RV32-WITHFP-NEXT:    addi s0, sp, 24
-; RV32-WITHFP-NEXT:    addi a0, s0, 4
-; RV32-WITHFP-NEXT:    sw a0, -12(s0)
-; RV32-WITHFP-NEXT:    lw a0, -12(s0)
 ; RV32-WITHFP-NEXT:    sw a3, 4(s0)
 ; RV32-WITHFP-NEXT:    sw a4, 8(s0)
 ; RV32-WITHFP-NEXT:    sw a5, 12(s0)
 ; RV32-WITHFP-NEXT:    sw a6, 16(s0)
 ; RV32-WITHFP-NEXT:    sw a7, 20(s0)
+; RV32-WITHFP-NEXT:    addi a0, s0, 4
+; RV32-WITHFP-NEXT:    sw a0, -12(s0)
+; RV32-WITHFP-NEXT:    lw a0, -12(s0)
 ; RV32-WITHFP-NEXT:    addi a3, a0, 7
 ; RV32-WITHFP-NEXT:    addi a0, a0, 15
 ; RV32-WITHFP-NEXT:    andi a3, a3, -8
@@ -1009,13 +1009,13 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; RV64-WITHFP-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64-WITHFP-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64-WITHFP-NEXT:    addi s0, sp, 32
-; RV64-WITHFP-NEXT:    mv a0, s0
-; RV64-WITHFP-NEXT:    sd a0, -24(s0)
-; RV64-WITHFP-NEXT:    ld a0, -24(s0)
 ; RV64-WITHFP-NEXT:    sd a2, 0(s0)
 ; RV64-WITHFP-NEXT:    sd a3, 8(s0)
 ; RV64-WITHFP-NEXT:    sd a4, 16(s0)
 ; RV64-WITHFP-NEXT:    sd a5, 24(s0)
+; RV64-WITHFP-NEXT:    mv a0, s0
+; RV64-WITHFP-NEXT:    sd a0, -24(s0)
+; RV64-WITHFP-NEXT:    ld a0, -24(s0)
 ; RV64-WITHFP-NEXT:    sd a6, 32(s0)
 ; RV64-WITHFP-NEXT:    sd a7, 40(s0)
 ; RV64-WITHFP-NEXT:    addi a2, a0, 7
@@ -1233,14 +1233,14 @@ define iXLen @va4_va_copy(i32 %argno, ...) nounwind {
 ; RV32-NEXT:    addi a0, sp, 36
 ; RV32-NEXT:    sw a0, 16(sp)
 ; RV32-NEXT:    lw a0, 16(sp)
-; RV32-NEXT:    addi a0, a0, 3
 ; RV32-NEXT:    li s0, -4
+; RV32-NEXT:    addi a0, a0, 3
 ; RV32-NEXT:    and a0, a0, s0
 ; RV32-NEXT:    addi a1, a0, 4
 ; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    lw a1, 16(sp)
 ; RV32-NEXT:    lw s1, 0(a0)
-; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    lw a0, 16(sp)
+; RV32-NEXT:    sw a0, 12(sp)
 ; RV32-NEXT:    lw a0, 12(sp)
 ; RV32-NEXT:    call notdead
 ; RV32-NEXT:    lw a0, 16(sp)
@@ -1254,8 +1254,8 @@ define iXLen @va4_va_copy(i32 %argno, ...) nounwind {
 ; RV32-NEXT:    and a1, a1, s0
 ; RV32-NEXT:    addi a2, a1, 4
 ; RV32-NEXT:    sw a2, 16(sp)
-; RV32-NEXT:    lw a2, 16(sp)
 ; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    lw a2, 16(sp)
 ; RV32-NEXT:    addi a2, a2, 3
 ; RV32-NEXT:    andi a2, a2, -4
 ; RV32-NEXT:    addi a3, a2, 4
@@ -1286,18 +1286,18 @@ define iXLen @va4_va_copy(i32 %argno, ...) nounwind {
 ; RV64-NEXT:    addi a0, sp, 56
 ; RV64-NEXT:    sd a0, 16(sp)
 ; RV64-NEXT:    ld a0, 16(sp)
-; RV64-NEXT:    addi a0, a0, 7
 ; RV64-NEXT:    li s0, -8
+; RV64-NEXT:    addi a0, a0, 7
 ; RV64-NEXT:    and a0, a0, s0
 ; RV64-NEXT:    addi a1, a0, 8
 ; RV64-NEXT:    sd a1, 16(sp)
 ; RV64-NEXT:    ld a1, 16(sp)
 ; RV64-NEXT:    ld s1, 0(a0)
 ; RV64-NEXT:    sd a1, 8(sp)
-; RV64-NEXT:    lw a0, 12(sp)
-; RV64-NEXT:    lwu a1, 8(sp)
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    or a0, a0, a1
+; RV64-NEXT:    lwu a0, 8(sp)
+; RV64-NEXT:    lw a1, 12(sp)
+; RV64-NEXT:    slli a1, a1, 32
+; RV64-NEXT:    or a0, a1, a0
 ; RV64-NEXT:    call notdead
 ; RV64-NEXT:    ld a0, 16(sp)
 ; RV64-NEXT:    addi a0, a0, 7
@@ -1310,8 +1310,8 @@ define iXLen @va4_va_copy(i32 %argno, ...) nounwind {
 ; RV64-NEXT:    and a1, a1, s0
 ; RV64-NEXT:    addi a2, a1, 8
 ; RV64-NEXT:    sd a2, 16(sp)
-; RV64-NEXT:    ld a2, 16(sp)
 ; RV64-NEXT:    ld a1, 0(a1)
+; RV64-NEXT:    ld a2, 16(sp)
 ; RV64-NEXT:    addi a2, a2, 7
 ; RV64-NEXT:    andi a2, a2, -8
 ; RV64-NEXT:    addi a3, a2, 8
@@ -1344,14 +1344,14 @@ define iXLen @va4_va_copy(i32 %argno, ...) nounwind {
 ; RV32-WITHFP-NEXT:    addi a0, s0, 4
 ; RV32-WITHFP-NEXT:    sw a0, -20(s0)
 ; RV32-WITHFP-NEXT:    lw a0, -20(s0)
-; RV32-WITHFP-NEXT:    addi a0, a0, 3
 ; RV32-WITHFP-NEXT:    li s1, -4
+; RV32-WITHFP-NEXT:    addi a0, a0, 3
 ; RV32-WITHFP-NEXT:    and a0, a0, s1
 ; RV32-WITHFP-NEXT:    addi a1, a0, 4
 ; RV32-WITHFP-NEXT:    sw a1, -20(s0)
-; RV32-WITHFP-NEXT:    lw a1, -20(s0)
 ; RV32-WITHFP-NEXT:    lw s2, 0(a0)
-; RV32-WITHFP-NEXT:    sw a1, -24(s0)
+; RV32-WITHFP-NEXT:    lw a0, -20(s0)
+; RV32-WITHFP-NEXT:    sw a0, -24(s0)
 ; RV32-WITHFP-NEXT:    lw a0, -24(s0)
 ; RV32-WITHFP-NEXT:    call notdead
 ; RV32-WITHFP-NEXT:    lw a0, -20(s0)
@@ -1365,8 +1365,8 @@ define iXLen @va4_va_copy(i32 %argno, ...) nounwind {
 ; RV32-WITHFP-NEXT:    and a1, a1, s1
 ; RV32-WITHFP-NEXT:    addi a2, a1, 4
 ; RV32-WITHFP-NEXT:    sw a2, -20(s0)
-; RV32-WITHFP-NEXT:    lw a2, -20(s0)
 ; RV32-WITHFP-NEXT:    lw a1, 0(a1)
+; RV32-WITHFP-NEXT:    lw a2, -20(s0)
 ; RV32-WITHFP-NEXT:    addi a2, a2, 3
 ; RV32-WITHFP-NEXT:    andi a2, a2, -4
 ; RV32-WITHFP-NEXT:    addi a3, a2, 4
@@ -1400,18 +1400,18 @@ define iXLen @va4_va_copy(i32 %argno, ...) nounwind {
 ; RV64-WITHFP-NEXT:    addi a0, s0, 8
 ; RV64-WITHFP-NEXT:    sd a0, -40(s0)
 ; RV64-WITHFP-NEXT:    ld a0, -40(s0)
-; RV64-WITHFP-NEXT:    addi a0, a0, 7
 ; RV64-WITHFP-NEXT:    li s1, -8
+; RV64-WITHFP-NEXT:    addi a0, a0, 7
 ; RV64-WITHFP-NEXT:    and a0, a0, s1
 ; RV64-WITHFP-NEXT:    addi a1, a0, 8
 ; RV64-WITHFP-NEXT:    sd a1, -40(s0)
 ; RV64-WITHFP-NEXT:    ld a1, -40(s0)
 ; RV64-WITHFP-NEXT:    ld s2, 0(a0)
 ; RV64-WITHFP-NEXT:    sd a1, -48(s0)
-; RV64-WITHFP-NEXT:    lw a0, -44(s0)
-; RV64-WITHFP-NEXT:    lwu a1, -48(s0)
-; RV64-WITHFP-NEXT:    slli a0, a0, 32
-; RV64-WITHFP-NEXT:    or a0, a0, a1
+; RV64-WITHFP-NEXT:    lwu a0, -48(s0)
+; RV64-WITHFP-NEXT:    lw a1, -44(s0)
+; RV64-WITHFP-NEXT:    slli a1, a1, 32
+; RV64-WITHFP-NEXT:    or a0, a1, a0
 ; RV64-WITHFP-NEXT:    call notdead
 ; RV64-WITHFP-NEXT:    ld a0, -40(s0)
 ; RV64-WITHFP-NEXT:    addi a0, a0, 7
@@ -1424,8 +1424,8 @@ define iXLen @va4_va_copy(i32 %argno, ...) nounwind {
 ; RV64-WITHFP-NEXT:    and a1, a1, s1
 ; RV64-WITHFP-NEXT:    addi a2, a1, 8
 ; RV64-WITHFP-NEXT:    sd a2, -40(s0)
-; RV64-WITHFP-NEXT:    ld a2, -40(s0)
 ; RV64-WITHFP-NEXT:    ld a1, 0(a1)
+; RV64-WITHFP-NEXT:    ld a2, -40(s0)
 ; RV64-WITHFP-NEXT:    addi a2, a2, 7
 ; RV64-WITHFP-NEXT:    andi a2, a2, -8
 ; RV64-WITHFP-NEXT:    addi a3, a2, 8
@@ -1593,19 +1593,19 @@ define i32 @va_large_stack(ptr %fmt, ...) {
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    sw a4, 288(a0)
 ; RV32-NEXT:    lui a0, 24414
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    sw a5, 292(a0)
+; RV32-NEXT:    lui a0, 24414
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    sw a6, 296(a0)
+; RV32-NEXT:    lui a0, 24414
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    sw a7, 300(a0)
+; RV32-NEXT:    lui a0, 24414
 ; RV32-NEXT:    addi a0, a0, 276
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    sw a0, 12(sp)
 ; RV32-NEXT:    lw a0, 12(sp)
-; RV32-NEXT:    lui a1, 24414
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    sw a5, 292(a1)
-; RV32-NEXT:    lui a1, 24414
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    sw a6, 296(a1)
-; RV32-NEXT:    lui a1, 24414
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    sw a7, 300(a1)
 ; RV32-NEXT:    addi a1, a0, 4
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lw a0, 0(a0)
@@ -1682,12 +1682,12 @@ define i32 @va_large_stack(ptr %fmt, ...) {
 ; RV32-WITHFP-NEXT:    sw a2, 8(s0)
 ; RV32-WITHFP-NEXT:    sw a3, 12(s0)
 ; RV32-WITHFP-NEXT:    sw a4, 16(s0)
-; RV32-WITHFP-NEXT:    addi a1, s0, 4
-; RV32-WITHFP-NEXT:    sw a1, 0(a0)
-; RV32-WITHFP-NEXT:    lw a1, 0(a0)
 ; RV32-WITHFP-NEXT:    sw a5, 20(s0)
 ; RV32-WITHFP-NEXT:    sw a6, 24(s0)
 ; RV32-WITHFP-NEXT:    sw a7, 28(s0)
+; RV32-WITHFP-NEXT:    addi a1, s0, 4
+; RV32-WITHFP-NEXT:    sw a1, 0(a0)
+; RV32-WITHFP-NEXT:    lw a1, 0(a0)
 ; RV32-WITHFP-NEXT:    addi a2, a1, 4
 ; RV32-WITHFP-NEXT:    sw a2, 0(a0)
 ; RV32-WITHFP-NEXT:    lw a0, 0(a1)
@@ -1869,12 +1869,12 @@ define i32 @va_printf(ptr %fmt, ...) {
 ; RV32-NEXT:    sw a2, 24(sp)
 ; RV32-NEXT:    sw a3, 28(sp)
 ; RV32-NEXT:    sw a4, 32(sp)
-; RV32-NEXT:    addi a1, sp, 20
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    lw a1, 8(sp)
 ; RV32-NEXT:    sw a5, 36(sp)
 ; RV32-NEXT:    sw a6, 40(sp)
 ; RV32-NEXT:    sw a7, 44(sp)
+; RV32-NEXT:    addi a1, sp, 20
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lw a1, 8(sp)
 ; RV32-NEXT:    call va_vprintf
 ; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore ra
@@ -1892,12 +1892,12 @@ define i32 @va_printf(ptr %fmt, ...) {
 ; RV64-NEXT:    sd a2, 32(sp)
 ; RV64-NEXT:    sd a3, 40(sp)
 ; RV64-NEXT:    sd a4, 48(sp)
-; RV64-NEXT:    addi a1, sp, 24
-; RV64-NEXT:    sd a1, 0(sp)
-; RV64-NEXT:    ld a1, 0(sp)
 ; RV64-NEXT:    sd a5, 56(sp)
 ; RV64-NEXT:    sd a6, 64(sp)
 ; RV64-NEXT:    sd a7, 72(sp)
+; RV64-NEXT:    addi a1, sp, 24
+; RV64-NEXT:    sd a1, 0(sp)
+; RV64-NEXT:    ld a1, 0(sp)
 ; RV64-NEXT:    call va_vprintf
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    .cfi_restore ra
@@ -1919,12 +1919,12 @@ define i32 @va_printf(ptr %fmt, ...) {
 ; RV32-WITHFP-NEXT:    sw a2, 8(s0)
 ; RV32-WITHFP-NEXT:    sw a3, 12(s0)
 ; RV32-WITHFP-NEXT:    sw a4, 16(s0)
-; RV32-WITHFP-NEXT:    addi a1, s0, 4
-; RV32-WITHFP-NEXT:    sw a1, -12(s0)
-; RV32-WITHFP-NEXT:    lw a1, -12(s0)
 ; RV32-WITHFP-NEXT:    sw a5, 20(s0)
 ; RV32-WITHFP-NEXT:    sw a6, 24(s0)
 ; RV32-WITHFP-NEXT:    sw a7, 28(s0)
+; RV32-WITHFP-NEXT:    addi a1, s0, 4
+; RV32-WITHFP-NEXT:    sw a1, -12(s0)
+; RV32-WITHFP-NEXT:    lw a1, -12(s0)
 ; RV32-WITHFP-NEXT:    call va_vprintf
 ; RV32-WITHFP-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -1949,12 +1949,12 @@ define i32 @va_printf(ptr %fmt, ...) {
 ; RV64-WITHFP-NEXT:    sd a2, 16(s0)
 ; RV64-WITHFP-NEXT:    sd a3, 24(s0)
 ; RV64-WITHFP-NEXT:    sd a4, 32(s0)
-; RV64-WITHFP-NEXT:    addi a1, s0, 8
-; RV64-WITHFP-NEXT:    sd a1, -24(s0)
-; RV64-WITHFP-NEXT:    ld a1, -24(s0)
 ; RV64-WITHFP-NEXT:    sd a5, 40(s0)
 ; RV64-WITHFP-NEXT:    sd a6, 48(s0)
 ; RV64-WITHFP-NEXT:    sd a7, 56(s0)
+; RV64-WITHFP-NEXT:    addi a1, s0, 8
+; RV64-WITHFP-NEXT:    sd a1, -24(s0)
+; RV64-WITHFP-NEXT:    ld a1, -24(s0)
 ; RV64-WITHFP-NEXT:    call va_vprintf
 ; RV64-WITHFP-NEXT:    .cfi_def_cfa sp, 96
 ; RV64-WITHFP-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
index bc002fee4417c..47c17d615e0f2 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -5,22 +5,22 @@
 define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_4bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a0, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    lbu a4, 0(a1)
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu a7, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a4, a6, a4
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 1(a1)
+; RV64I-NEXT:    lbu a6, 2(a1)
+; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    slli a0, a0, 16
 ; RV64I-NEXT:    slli a1, a1, 16
 ; RV64I-NEXT:    or a0, a0, a3
@@ -40,22 +40,22 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_4bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a0, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a6, 1(a1)
-; RV32I-NEXT:    lbu a7, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    slli a0, a0, 16
 ; RV32I-NEXT:    slli a1, a1, 16
 ; RV32I-NEXT:    or a0, a0, a3
@@ -82,22 +82,22 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_4bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a0, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    lbu a4, 0(a1)
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu a7, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a4, a6, a4
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 1(a1)
+; RV64I-NEXT:    lbu a6, 2(a1)
+; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    slli a0, a0, 16
 ; RV64I-NEXT:    slli a1, a1, 16
 ; RV64I-NEXT:    or a0, a0, a3
@@ -117,22 +117,22 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_4bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a0, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a6, 1(a1)
-; RV32I-NEXT:    lbu a7, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    slli a0, a0, 16
 ; RV32I-NEXT:    slli a1, a1, 16
 ; RV32I-NEXT:    or a0, a0, a3
@@ -159,22 +159,22 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_4bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a0, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    lbu a4, 0(a1)
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu a7, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a4, a6, a4
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 1(a1)
+; RV64I-NEXT:    lbu a6, 2(a1)
+; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    slli a0, a0, 16
 ; RV64I-NEXT:    slli a1, a1, 16
 ; RV64I-NEXT:    or a0, a0, a3
@@ -194,22 +194,22 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_4bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a0, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a6, 1(a1)
-; RV32I-NEXT:    lbu a7, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    slli a0, a0, 16
 ; RV32I-NEXT:    slli a1, a1, 16
 ; RV32I-NEXT:    or a0, a0, a3
@@ -247,38 +247,38 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a0, 7(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 0(a1)
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu t2, 2(a1)
-; RV64I-NEXT:    lbu t3, 3(a1)
-; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a7, 1(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t2, 3(a1)
 ; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    or a0, a0, t1
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu t0, 5(a1)
-; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    or a7, t2, t0
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    lbu t2, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli t3, t3, 8
-; RV64I-NEXT:    or t2, t3, t2
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a6, t0, a6
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lui a4, 16
 ; RV64I-NEXT:    addi a4, a4, -1
 ; RV64I-NEXT:    slli a0, a0, 16
-; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli a1, a1, 16
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    or a5, t2, a5
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a0, a0, a3
@@ -310,54 +310,54 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu a0, 7(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 6(a0)
+; RV32I-NEXT:    lbu a0, 7(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    or a6, a0, t0
+; RV32I-NEXT:    lbu a0, 0(a1)
 ; RV32I-NEXT:    lbu a7, 1(a1)
 ; RV32I-NEXT:    lbu t0, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or t1, a0, t1
 ; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    or a7, a7, a0
 ; RV32I-NEXT:    slli a1, a1, 8
 ; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    slli a0, a4, 16
+; RV32I-NEXT:    slli a0, a5, 16
 ; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    slli a3, t1, 16
-; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a3, a1, 16
+; RV32I-NEXT:    or a1, a6, a4
+; RV32I-NEXT:    or a3, a3, a7
+; RV32I-NEXT:    slli a3, a3, 3
 ; RV32I-NEXT:    li a4, 32
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    bltu a1, a4, .LBB3_2
+; RV32I-NEXT:    bltu a3, a4, .LBB3_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    srl a5, a3, a1
-; RV32I-NEXT:    bnez a1, .LBB3_3
+; RV32I-NEXT:    srl a5, a1, a3
+; RV32I-NEXT:    bnez a3, .LBB3_3
 ; RV32I-NEXT:    j .LBB3_4
 ; RV32I-NEXT:  .LBB3_2:
-; RV32I-NEXT:    srl a5, a0, a1
-; RV32I-NEXT:    neg a6, a1
-; RV32I-NEXT:    sll a6, a3, a6
+; RV32I-NEXT:    srl a5, a0, a3
+; RV32I-NEXT:    neg a6, a3
+; RV32I-NEXT:    sll a6, a1, a6
 ; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    beqz a1, .LBB3_4
+; RV32I-NEXT:    beqz a3, .LBB3_4
 ; RV32I-NEXT:  .LBB3_3:
 ; RV32I-NEXT:    mv a0, a5
 ; RV32I-NEXT:  .LBB3_4:
-; RV32I-NEXT:    bltu a1, a4, .LBB3_6
+; RV32I-NEXT:    bltu a3, a4, .LBB3_6
 ; RV32I-NEXT:  # %bb.5:
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    j .LBB3_7
 ; RV32I-NEXT:  .LBB3_6:
-; RV32I-NEXT:    srl a1, a3, a1
+; RV32I-NEXT:    srl a1, a1, a3
 ; RV32I-NEXT:  .LBB3_7:
 ; RV32I-NEXT:    srli a3, a0, 16
 ; RV32I-NEXT:    lui a4, 16
@@ -398,38 +398,38 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a0, 7(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 0(a1)
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu t2, 2(a1)
-; RV64I-NEXT:    lbu t3, 3(a1)
-; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a7, 1(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t2, 3(a1)
 ; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    or a0, a0, t1
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu t0, 5(a1)
-; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    or a7, t2, t0
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    lbu t2, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli t3, t3, 8
-; RV64I-NEXT:    or t2, t3, t2
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a6, t0, a6
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lui a4, 16
 ; RV64I-NEXT:    addi a4, a4, -1
 ; RV64I-NEXT:    slli a0, a0, 16
-; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli a1, a1, 16
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    or a5, t2, a5
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a0, a0, a3
@@ -461,34 +461,34 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu a0, 7(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 6(a0)
+; RV32I-NEXT:    lbu a0, 7(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, a7, a4
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    lbu a4, 0(a1)
 ; RV32I-NEXT:    lbu a7, 1(a1)
 ; RV32I-NEXT:    lbu t0, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, t1
 ; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    or a7, a7, a4
 ; RV32I-NEXT:    slli a1, a1, 8
 ; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a4, a5, 16
 ; RV32I-NEXT:    or a4, a4, a3
 ; RV32I-NEXT:    slli a0, a0, 16
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    or a3, a1, a6
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    or a3, a1, a7
 ; RV32I-NEXT:    slli a3, a3, 3
 ; RV32I-NEXT:    li a1, 32
-; RV32I-NEXT:    or a0, a0, a5
 ; RV32I-NEXT:    bltu a3, a1, .LBB4_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    li a1, 0
@@ -544,38 +544,38 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a0, 7(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 0(a1)
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu t2, 2(a1)
-; RV64I-NEXT:    lbu t3, 3(a1)
-; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a7, 1(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t2, 3(a1)
 ; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    or a0, a0, t1
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu t0, 5(a1)
-; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    or a7, t2, t0
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    lbu t2, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli t3, t3, 8
-; RV64I-NEXT:    or t2, t3, t2
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a6, t0, a6
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lui a4, 16
 ; RV64I-NEXT:    addi a4, a4, -1
 ; RV64I-NEXT:    slli a0, a0, 16
-; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli a1, a1, 16
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    or a5, t2, a5
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a0, a0, a3
@@ -607,54 +607,54 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu a0, 7(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 6(a0)
+; RV32I-NEXT:    lbu a0, 7(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    or a6, a0, t0
+; RV32I-NEXT:    lbu a0, 0(a1)
 ; RV32I-NEXT:    lbu a7, 1(a1)
 ; RV32I-NEXT:    lbu t0, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or t1, a0, t1
 ; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    or a7, a7, a0
 ; RV32I-NEXT:    slli a1, a1, 8
 ; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    slli a0, a4, 16
+; RV32I-NEXT:    slli a0, a5, 16
 ; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    slli a3, t1, 16
-; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a3, a1, 16
+; RV32I-NEXT:    or a1, a6, a4
+; RV32I-NEXT:    or a3, a3, a7
+; RV32I-NEXT:    slli a3, a3, 3
 ; RV32I-NEXT:    li a4, 32
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    bltu a1, a4, .LBB5_2
+; RV32I-NEXT:    bltu a3, a4, .LBB5_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sra a5, a3, a1
-; RV32I-NEXT:    bnez a1, .LBB5_3
+; RV32I-NEXT:    sra a5, a1, a3
+; RV32I-NEXT:    bnez a3, .LBB5_3
 ; RV32I-NEXT:    j .LBB5_4
 ; RV32I-NEXT:  .LBB5_2:
-; RV32I-NEXT:    srl a5, a0, a1
-; RV32I-NEXT:    neg a6, a1
-; RV32I-NEXT:    sll a6, a3, a6
+; RV32I-NEXT:    srl a5, a0, a3
+; RV32I-NEXT:    neg a6, a3
+; RV32I-NEXT:    sll a6, a1, a6
 ; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    beqz a1, .LBB5_4
+; RV32I-NEXT:    beqz a3, .LBB5_4
 ; RV32I-NEXT:  .LBB5_3:
 ; RV32I-NEXT:    mv a0, a5
 ; RV32I-NEXT:  .LBB5_4:
-; RV32I-NEXT:    bltu a1, a4, .LBB5_6
+; RV32I-NEXT:    bltu a3, a4, .LBB5_6
 ; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    srai a1, a3, 31
+; RV32I-NEXT:    srai a1, a1, 31
 ; RV32I-NEXT:    j .LBB5_7
 ; RV32I-NEXT:  .LBB5_6:
-; RV32I-NEXT:    sra a1, a3, a1
+; RV32I-NEXT:    sra a1, a1, a3
 ; RV32I-NEXT:  .LBB5_7:
 ; RV32I-NEXT:    srli a3, a0, 16
 ; RV32I-NEXT:    lui a4, 16
@@ -686,8 +686,6 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -16
-; RV64I-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 0(a0)
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
@@ -702,81 +700,81 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu t6, 11(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 13(a0)
-; RV64I-NEXT:    lbu s0, 14(a0)
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 13(a0)
+; RV64I-NEXT:    lbu t0, 14(a0)
 ; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or t1, t4, t3
-; RV64I-NEXT:    or t2, t6, t5
-; RV64I-NEXT:    lbu t3, 0(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    or t2, t4, t3
+; RV64I-NEXT:    or t3, t6, t5
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 0(a1)
 ; RV64I-NEXT:    lbu t4, 1(a1)
 ; RV64I-NEXT:    lbu t5, 2(a1)
 ; RV64I-NEXT:    lbu t6, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    slli a0, a0, 8
 ; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    or a0, a0, s0
-; RV64I-NEXT:    or a6, t4, t3
-; RV64I-NEXT:    lbu t3, 4(a1)
-; RV64I-NEXT:    lbu t4, 5(a1)
-; RV64I-NEXT:    lbu s0, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a7, t4, a7
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    lbu t4, 4(a1)
+; RV64I-NEXT:    lbu t5, 5(a1)
+; RV64I-NEXT:    lbu t6, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t5, t5, 8
+; RV64I-NEXT:    or t4, t5, t4
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, s0
+; RV64I-NEXT:    or a1, a1, t6
 ; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    or a4, t0, a7
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    or a7, t2, t1
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    or a4, t1, a5
+; RV64I-NEXT:    slli t3, t3, 16
+; RV64I-NEXT:    or a5, t3, t2
 ; RV64I-NEXT:    slli a0, a0, 16
-; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli t5, t5, 16
-; RV64I-NEXT:    or a5, t5, a6
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    slli a1, a1, 16
-; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a1, a1, t4
 ; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    slli a6, a0, 32
-; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    slli a7, a0, 32
+; RV64I-NEXT:    slli t0, a1, 32
 ; RV64I-NEXT:    or a0, a4, a3
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    or a1, a7, a5
+; RV64I-NEXT:    or a3, t0, a6
+; RV64I-NEXT:    slli a3, a3, 3
 ; RV64I-NEXT:    li a4, 64
-; RV64I-NEXT:    or a3, a6, a7
-; RV64I-NEXT:    bltu a1, a4, .LBB6_2
+; RV64I-NEXT:    bltu a3, a4, .LBB6_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    subw a5, a1, a4
-; RV64I-NEXT:    srl a5, a3, a5
-; RV64I-NEXT:    bnez a1, .LBB6_3
+; RV64I-NEXT:    subw a5, a3, a4
+; RV64I-NEXT:    srl a5, a1, a5
+; RV64I-NEXT:    bnez a3, .LBB6_3
 ; RV64I-NEXT:    j .LBB6_4
 ; RV64I-NEXT:  .LBB6_2:
-; RV64I-NEXT:    srl a5, a0, a1
-; RV64I-NEXT:    negw a6, a1
-; RV64I-NEXT:    sll a6, a3, a6
+; RV64I-NEXT:    srl a5, a0, a3
+; RV64I-NEXT:    negw a6, a3
+; RV64I-NEXT:    sll a6, a1, a6
 ; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    beqz a1, .LBB6_4
+; RV64I-NEXT:    beqz a3, .LBB6_4
 ; RV64I-NEXT:  .LBB6_3:
 ; RV64I-NEXT:    mv a0, a5
 ; RV64I-NEXT:  .LBB6_4:
-; RV64I-NEXT:    bltu a1, a4, .LBB6_6
+; RV64I-NEXT:    bltu a3, a4, .LBB6_6
 ; RV64I-NEXT:  # %bb.5:
 ; RV64I-NEXT:    li a1, 0
 ; RV64I-NEXT:    j .LBB6_7
 ; RV64I-NEXT:  .LBB6_6:
-; RV64I-NEXT:    srl a1, a3, a1
+; RV64I-NEXT:    srl a1, a1, a3
 ; RV64I-NEXT:  .LBB6_7:
 ; RV64I-NEXT:    srli a3, a0, 32
 ; RV64I-NEXT:    srliw a4, a0, 16
@@ -814,8 +812,6 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a3, 13(a2)
 ; RV64I-NEXT:    sb t4, 14(a2)
 ; RV64I-NEXT:    sb t5, 15(a2)
-; RV64I-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_16bytes:
@@ -833,42 +829,42 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    lbu t4, 8(a0)
-; RV32I-NEXT:    lbu t5, 9(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
+; RV32I-NEXT:    lbu t3, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    slli s0, t3, 8
 ; RV32I-NEXT:    or t3, a7, a6
 ; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    lbu a6, 12(a0)
-; RV32I-NEXT:    lbu a7, 13(a0)
-; RV32I-NEXT:    lbu t2, 14(a0)
+; RV32I-NEXT:    or a6, s0, t4
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t2, 13(a0)
+; RV32I-NEXT:    lbu t4, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    or t5, s0, t6
-; RV32I-NEXT:    or t6, a7, a6
-; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or t5, t6, t5
+; RV32I-NEXT:    or t2, t2, a7
+; RV32I-NEXT:    or t4, a0, t4
+; RV32I-NEXT:    lbu a0, 0(a1)
 ; RV32I-NEXT:    lbu a7, 1(a1)
-; RV32I-NEXT:    lbu s0, 2(a1)
+; RV32I-NEXT:    lbu t6, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or t2, a0, t2
 ; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or s1, a7, a6
+; RV32I-NEXT:    or s0, a7, a0
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or s0, a1, s0
+; RV32I-NEXT:    or t6, a1, t6
 ; RV32I-NEXT:    li a7, 32
 ; RV32I-NEXT:    slli a1, a5, 8
 ; RV32I-NEXT:    slli a0, t0, 8
 ; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    or a6, t5, t4
-; RV32I-NEXT:    or t0, t2, t6
-; RV32I-NEXT:    or a5, s0, s1
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t6, t6, 16
+; RV32I-NEXT:    or a6, t5, a6
+; RV32I-NEXT:    or t0, t4, t2
+; RV32I-NEXT:    or a5, t6, s0
 ; RV32I-NEXT:    slli a5, a5, 3
 ; RV32I-NEXT:    srl t2, a6, a5
 ; RV32I-NEXT:    neg t5, a5
@@ -1019,8 +1015,6 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_16bytes_wordOff:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -16
-; RV64I-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 0(a0)
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
@@ -1035,81 +1029,81 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    lbu t6, 11(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 13(a0)
-; RV64I-NEXT:    lbu s0, 14(a0)
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 13(a0)
+; RV64I-NEXT:    lbu t0, 14(a0)
 ; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or t1, t4, t3
-; RV64I-NEXT:    or t2, t6, t5
-; RV64I-NEXT:    lbu t3, 0(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    or t2, t4, t3
+; RV64I-NEXT:    or t3, t6, t5
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 0(a1)
 ; RV64I-NEXT:    lbu t4, 1(a1)
 ; RV64I-NEXT:    lbu t5, 2(a1)
 ; RV64I-NEXT:    lbu t6, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    slli a0, a0, 8
 ; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    or a0, a0, s0
-; RV64I-NEXT:    or a6, t4, t3
-; RV64I-NEXT:    lbu t3, 4(a1)
-; RV64I-NEXT:    lbu t4, 5(a1)
-; RV64I-NEXT:    lbu s0, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a7, t4, a7
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    lbu t4, 4(a1)
+; RV64I-NEXT:    lbu t5, 5(a1)
+; RV64I-NEXT:    lbu t6, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t5, t5, 8
+; RV64I-NEXT:    or t4, t5, t4
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, s0
+; RV64I-NEXT:    or a1, a1, t6
 ; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    or a4, t0, a7
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    or a7, t2, t1
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    or a4, t1, a5
+; RV64I-NEXT:    slli t3, t3, 16
+; RV64I-NEXT:    or a5, t3, t2
 ; RV64I-NEXT:    slli a0, a0, 16
-; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli t5, t5, 16
-; RV64I-NEXT:    or a5, t5, a6
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    slli a1, a1, 16
-; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a1, a1, t4
 ; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    slli a6, a0, 32
-; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    slli a7, a0, 32
+; RV64I-NEXT:    slli t0, a1, 32
 ; RV64I-NEXT:    or a0, a4, a3
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    slli a1, a1, 5
+; RV64I-NEXT:    or a1, a7, a5
+; RV64I-NEXT:    or a3, t0, a6
+; RV64I-NEXT:    slli a3, a3, 5
 ; RV64I-NEXT:    li a4, 64
-; RV64I-NEXT:    or a3, a6, a7
-; RV64I-NEXT:    bltu a1, a4, .LBB7_2
+; RV64I-NEXT:    bltu a3, a4, .LBB7_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    subw a5, a1, a4
-; RV64I-NEXT:    srl a5, a3, a5
-; RV64I-NEXT:    bnez a1, .LBB7_3
+; RV64I-NEXT:    subw a5, a3, a4
+; RV64I-NEXT:    srl a5, a1, a5
+; RV64I-NEXT:    bnez a3, .LBB7_3
 ; RV64I-NEXT:    j .LBB7_4
 ; RV64I-NEXT:  .LBB7_2:
-; RV64I-NEXT:    srl a5, a0, a1
-; RV64I-NEXT:    negw a6, a1
-; RV64I-NEXT:    sll a6, a3, a6
+; RV64I-NEXT:    srl a5, a0, a3
+; RV64I-NEXT:    negw a6, a3
+; RV64I-NEXT:    sll a6, a1, a6
 ; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    beqz a1, .LBB7_4
+; RV64I-NEXT:    beqz a3, .LBB7_4
 ; RV64I-NEXT:  .LBB7_3:
 ; RV64I-NEXT:    mv a0, a5
 ; RV64I-NEXT:  .LBB7_4:
-; RV64I-NEXT:    bltu a1, a4, .LBB7_6
+; RV64I-NEXT:    bltu a3, a4, .LBB7_6
 ; RV64I-NEXT:  # %bb.5:
 ; RV64I-NEXT:    li a1, 0
 ; RV64I-NEXT:    j .LBB7_7
 ; RV64I-NEXT:  .LBB7_6:
-; RV64I-NEXT:    srl a1, a3, a1
+; RV64I-NEXT:    srl a1, a1, a3
 ; RV64I-NEXT:  .LBB7_7:
 ; RV64I-NEXT:    srli a3, a0, 32
 ; RV64I-NEXT:    srliw a4, a0, 16
@@ -1147,8 +1141,6 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    sb a3, 13(a2)
 ; RV64I-NEXT:    sb t4, 14(a2)
 ; RV64I-NEXT:    sb t5, 15(a2)
-; RV64I-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_16bytes_wordOff:
@@ -1166,42 +1158,42 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    lbu t4, 8(a0)
-; RV32I-NEXT:    lbu t5, 9(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
+; RV32I-NEXT:    lbu t3, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    slli s0, t3, 8
 ; RV32I-NEXT:    or t3, a7, a6
 ; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    lbu a6, 12(a0)
-; RV32I-NEXT:    lbu a7, 13(a0)
-; RV32I-NEXT:    lbu t2, 14(a0)
+; RV32I-NEXT:    or a6, s0, t4
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t2, 13(a0)
+; RV32I-NEXT:    lbu t4, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    or t5, s0, t6
-; RV32I-NEXT:    or t6, a7, a6
-; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or t5, t6, t5
+; RV32I-NEXT:    or t2, t2, a7
+; RV32I-NEXT:    or t4, a0, t4
+; RV32I-NEXT:    lbu a0, 0(a1)
 ; RV32I-NEXT:    lbu a7, 1(a1)
-; RV32I-NEXT:    lbu s0, 2(a1)
+; RV32I-NEXT:    lbu t6, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or t2, a0, t2
 ; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or s1, a7, a6
+; RV32I-NEXT:    or s0, a7, a0
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or s0, a1, s0
+; RV32I-NEXT:    or t6, a1, t6
 ; RV32I-NEXT:    li a7, 32
 ; RV32I-NEXT:    slli a1, a5, 8
 ; RV32I-NEXT:    slli a0, t0, 8
 ; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    or a6, t5, t4
-; RV32I-NEXT:    or t0, t2, t6
-; RV32I-NEXT:    or a5, s0, s1
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t6, t6, 16
+; RV32I-NEXT:    or a6, t5, a6
+; RV32I-NEXT:    or t0, t4, t2
+; RV32I-NEXT:    or a5, t6, s0
 ; RV32I-NEXT:    slli a5, a5, 5
 ; RV32I-NEXT:    srl t2, a6, a5
 ; RV32I-NEXT:    neg t5, a5
@@ -1352,8 +1344,6 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -16
-; RV64I-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 0(a0)
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
@@ -1368,60 +1358,60 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu t6, 11(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 13(a0)
-; RV64I-NEXT:    lbu s0, 14(a0)
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 13(a0)
+; RV64I-NEXT:    lbu t0, 14(a0)
 ; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or t1, t4, t3
-; RV64I-NEXT:    or t2, t6, t5
-; RV64I-NEXT:    lbu t3, 0(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    or t2, t4, t3
+; RV64I-NEXT:    or t3, t6, t5
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 0(a1)
 ; RV64I-NEXT:    lbu t4, 1(a1)
 ; RV64I-NEXT:    lbu t5, 2(a1)
 ; RV64I-NEXT:    lbu t6, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    slli a0, a0, 8
 ; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    or a0, a0, s0
-; RV64I-NEXT:    or a6, t4, t3
-; RV64I-NEXT:    lbu t3, 4(a1)
-; RV64I-NEXT:    lbu t4, 5(a1)
-; RV64I-NEXT:    lbu s0, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a7, t4, a7
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    lbu t4, 4(a1)
+; RV64I-NEXT:    lbu t5, 5(a1)
+; RV64I-NEXT:    lbu t6, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t5, t5, 8
+; RV64I-NEXT:    or t4, t5, t4
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, s0
+; RV64I-NEXT:    or a1, a1, t6
 ; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    or a4, t0, a7
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    or a7, t2, t1
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    or a4, t1, a5
+; RV64I-NEXT:    slli t3, t3, 16
+; RV64I-NEXT:    or a5, t3, t2
 ; RV64I-NEXT:    slli a0, a0, 16
-; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli t5, t5, 16
-; RV64I-NEXT:    or a5, t5, a6
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    slli a1, a1, 16
-; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a1, a1, t4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a4, a4, a3
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    slli a3, a1, 3
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a3, a1, a6
+; RV64I-NEXT:    slli a3, a3, 3
 ; RV64I-NEXT:    li a5, 64
-; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    bltu a3, a5, .LBB8_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    li a1, 0
@@ -1475,8 +1465,6 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a3, 13(a2)
 ; RV64I-NEXT:    sb t4, 14(a2)
 ; RV64I-NEXT:    sb t5, 15(a2)
-; RV64I-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_16bytes:
@@ -1485,34 +1473,34 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 6(a0)
+; RV32I-NEXT:    lbu t1, 7(a0)
 ; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    or a7, t1, t0
 ; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 1(a1)
-; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    lbu t0, 1(a1)
 ; RV32I-NEXT:    lbu t1, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a7, a7, a6
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t2, t0, a6
 ; RV32I-NEXT:    li a6, 64
 ; RV32I-NEXT:    slli a1, a1, 8
 ; RV32I-NEXT:    or a1, a1, t1
 ; RV32I-NEXT:    li t1, 32
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli t2, t0, 16
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    or t0, a4, a3
-; RV32I-NEXT:    or a4, t2, a5
-; RV32I-NEXT:    or a5, a1, a7
+; RV32I-NEXT:    or t0, a5, a3
+; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    or a5, a1, t2
 ; RV32I-NEXT:    slli a5, a5, 3
 ; RV32I-NEXT:    neg t3, a5
 ; RV32I-NEXT:    srl t4, t0, t3
@@ -1533,8 +1521,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s3, 11(a0)
+; RV32I-NEXT:    lbu s0, 11(a0)
 ; RV32I-NEXT:    lbu s1, 15(a0)
 ; RV32I-NEXT:    sub a7, a6, a5
 ; RV32I-NEXT:    mv a3, a4
@@ -1542,11 +1529,11 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:  # %bb.4:
 ; RV32I-NEXT:    mv a3, t5
 ; RV32I-NEXT:  .LBB8_5:
-; RV32I-NEXT:    lbu s2, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu s0, 13(a0)
-; RV32I-NEXT:    lbu t6, 14(a0)
-; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    lbu s3, 9(a0)
+; RV32I-NEXT:    lbu s2, 10(a0)
+; RV32I-NEXT:    lbu t6, 13(a0)
+; RV32I-NEXT:    lbu t5, 14(a0)
 ; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    bltu a7, t1, .LBB8_7
 ; RV32I-NEXT:  # %bb.6:
@@ -1557,20 +1544,20 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sll s4, a4, s4
 ; RV32I-NEXT:    or s4, t4, s4
 ; RV32I-NEXT:  .LBB8_8:
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    lbu s6, 8(a0)
-; RV32I-NEXT:    lbu s5, 12(a0)
-; RV32I-NEXT:    or s3, s3, t5
-; RV32I-NEXT:    slli t5, s0, 8
-; RV32I-NEXT:    or s1, s1, t6
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s0, s0, s2
+; RV32I-NEXT:    lbu s5, 8(a0)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or s1, s1, t5
 ; RV32I-NEXT:    mv t4, t0
 ; RV32I-NEXT:    beqz a7, .LBB8_10
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    mv t4, s4
 ; RV32I-NEXT:  .LBB8_10:
-; RV32I-NEXT:    or a0, s2, s6
-; RV32I-NEXT:    slli s0, s3, 16
-; RV32I-NEXT:    or t6, t5, s5
+; RV32I-NEXT:    or a0, s3, s5
+; RV32I-NEXT:    slli s0, s0, 16
+; RV32I-NEXT:    or t6, t6, s2
 ; RV32I-NEXT:    slli s1, s1, 16
 ; RV32I-NEXT:    bltu a7, t1, .LBB8_12
 ; RV32I-NEXT:  # %bb.11:
@@ -1619,7 +1606,6 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    bltu a5, a6, .LBB8_24
 ; RV32I-NEXT:  # %bb.23:
@@ -1681,8 +1667,6 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_16bytes_wordOff:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -16
-; RV64I-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 0(a0)
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
@@ -1697,60 +1681,60 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV64I-NEXT:    lbu t6, 11(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 13(a0)
-; RV64I-NEXT:    lbu s0, 14(a0)
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 13(a0)
+; RV64I-NEXT:    lbu t0, 14(a0)
 ; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or t1, t4, t3
-; RV64I-NEXT:    or t2, t6, t5
-; RV64I-NEXT:    lbu t3, 0(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    or t2, t4, t3
+; RV64I-NEXT:    or t3, t6, t5
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 0(a1)
 ; RV64I-NEXT:    lbu t4, 1(a1)
 ; RV64I-NEXT:    lbu t5, 2(a1)
 ; RV64I-NEXT:    lbu t6, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    slli a0, a0, 8
 ; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    or a0, a0, s0
-; RV64I-NEXT:    or a6, t4, t3
-; RV64I-NEXT:    lbu t3, 4(a1)
-; RV64I-NEXT:    lbu t4, 5(a1)
-; RV64I-NEXT:    lbu s0, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a7, t4, a7
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    lbu t4, 4(a1)
+; RV64I-NEXT:    lbu t5, 5(a1)
+; RV64I-NEXT:    lbu t6, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t5, t5, 8
+; RV64I-NEXT:    or t4, t5, t4
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, s0
+; RV64I-NEXT:    or a1, a1, t6
 ; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    or a4, t0, a7
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    or a7, t2, t1
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    or a4, t1, a5
+; RV64I-NEXT:    slli t3, t3, 16
+; RV64I-NEXT:    or a5, t3, t2
 ; RV64I-NEXT:    slli a0, a0, 16
-; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli t5, t5, 16
-; RV64I-NEXT:    or a5, t5, a6
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    slli a1, a1, 16
-; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a1, a1, t4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a4, a4, a3
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    slli a3, a1, 5
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a3, a1, a6
+; RV64I-NEXT:    slli a3, a3, 5
 ; RV64I-NEXT:    li a5, 64
-; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    bltu a3, a5, .LBB9_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    li a1, 0
@@ -1804,8 +1788,6 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV64I-NEXT:    sb a3, 13(a2)
 ; RV64I-NEXT:    sb t4, 14(a2)
 ; RV64I-NEXT:    sb t5, 15(a2)
-; RV64I-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_16bytes_wordOff:
@@ -1814,34 +1796,34 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 6(a0)
+; RV32I-NEXT:    lbu t1, 7(a0)
 ; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    or a7, t1, t0
 ; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 1(a1)
-; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    lbu t0, 1(a1)
 ; RV32I-NEXT:    lbu t1, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a7, a7, a6
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t2, t0, a6
 ; RV32I-NEXT:    li a6, 64
 ; RV32I-NEXT:    slli a1, a1, 8
 ; RV32I-NEXT:    or a1, a1, t1
 ; RV32I-NEXT:    li t1, 32
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli t2, t0, 16
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    or t0, a4, a3
-; RV32I-NEXT:    or a4, t2, a5
-; RV32I-NEXT:    or a5, a1, a7
+; RV32I-NEXT:    or t0, a5, a3
+; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    or a5, a1, t2
 ; RV32I-NEXT:    slli a5, a5, 5
 ; RV32I-NEXT:    neg t3, a5
 ; RV32I-NEXT:    srl t4, t0, t3
@@ -1862,8 +1844,7 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s3, 11(a0)
+; RV32I-NEXT:    lbu s0, 11(a0)
 ; RV32I-NEXT:    lbu s1, 15(a0)
 ; RV32I-NEXT:    sub a7, a6, a5
 ; RV32I-NEXT:    mv a3, a4
@@ -1871,11 +1852,11 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:  # %bb.4:
 ; RV32I-NEXT:    mv a3, t5
 ; RV32I-NEXT:  .LBB9_5:
-; RV32I-NEXT:    lbu s2, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu s0, 13(a0)
-; RV32I-NEXT:    lbu t6, 14(a0)
-; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    lbu s3, 9(a0)
+; RV32I-NEXT:    lbu s2, 10(a0)
+; RV32I-NEXT:    lbu t6, 13(a0)
+; RV32I-NEXT:    lbu t5, 14(a0)
 ; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    bltu a7, t1, .LBB9_7
 ; RV32I-NEXT:  # %bb.6:
@@ -1886,20 +1867,20 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    sll s4, a4, s4
 ; RV32I-NEXT:    or s4, t4, s4
 ; RV32I-NEXT:  .LBB9_8:
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    lbu s6, 8(a0)
-; RV32I-NEXT:    lbu s5, 12(a0)
-; RV32I-NEXT:    or s3, s3, t5
-; RV32I-NEXT:    slli t5, s0, 8
-; RV32I-NEXT:    or s1, s1, t6
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s0, s0, s2
+; RV32I-NEXT:    lbu s5, 8(a0)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or s1, s1, t5
 ; RV32I-NEXT:    mv t4, t0
 ; RV32I-NEXT:    beqz a7, .LBB9_10
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    mv t4, s4
 ; RV32I-NEXT:  .LBB9_10:
-; RV32I-NEXT:    or a0, s2, s6
-; RV32I-NEXT:    slli s0, s3, 16
-; RV32I-NEXT:    or t6, t5, s5
+; RV32I-NEXT:    or a0, s3, s5
+; RV32I-NEXT:    slli s0, s0, 16
+; RV32I-NEXT:    or t6, t6, s2
 ; RV32I-NEXT:    slli s1, s1, 16
 ; RV32I-NEXT:    bltu a7, t1, .LBB9_12
 ; RV32I-NEXT:  # %bb.11:
@@ -1948,7 +1929,6 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    bltu a5, a6, .LBB9_24
 ; RV32I-NEXT:  # %bb.23:
@@ -2011,8 +1991,6 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -16
-; RV64I-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 0(a0)
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
@@ -2027,81 +2005,81 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu t6, 11(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 13(a0)
-; RV64I-NEXT:    lbu s0, 14(a0)
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 13(a0)
+; RV64I-NEXT:    lbu t0, 14(a0)
 ; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or t1, t4, t3
-; RV64I-NEXT:    or t2, t6, t5
-; RV64I-NEXT:    lbu t3, 0(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    or t2, t4, t3
+; RV64I-NEXT:    or t3, t6, t5
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 0(a1)
 ; RV64I-NEXT:    lbu t4, 1(a1)
 ; RV64I-NEXT:    lbu t5, 2(a1)
 ; RV64I-NEXT:    lbu t6, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    slli a0, a0, 8
 ; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    or a0, a0, s0
-; RV64I-NEXT:    or a6, t4, t3
-; RV64I-NEXT:    lbu t3, 4(a1)
-; RV64I-NEXT:    lbu t4, 5(a1)
-; RV64I-NEXT:    lbu s0, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a7, t4, a7
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    lbu t4, 4(a1)
+; RV64I-NEXT:    lbu t5, 5(a1)
+; RV64I-NEXT:    lbu t6, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t5, t5, 8
+; RV64I-NEXT:    or t4, t5, t4
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, s0
+; RV64I-NEXT:    or a1, a1, t6
 ; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    or a4, t0, a7
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    or a7, t2, t1
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    or a4, t1, a5
+; RV64I-NEXT:    slli t3, t3, 16
+; RV64I-NEXT:    or a5, t3, t2
 ; RV64I-NEXT:    slli a0, a0, 16
-; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli t5, t5, 16
-; RV64I-NEXT:    or a5, t5, a6
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    slli a1, a1, 16
-; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a1, a1, t4
 ; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    slli a6, a0, 32
-; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    slli a7, a0, 32
+; RV64I-NEXT:    slli t0, a1, 32
 ; RV64I-NEXT:    or a0, a4, a3
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    or a1, a7, a5
+; RV64I-NEXT:    or a3, t0, a6
+; RV64I-NEXT:    slli a3, a3, 3
 ; RV64I-NEXT:    li a4, 64
-; RV64I-NEXT:    or a3, a6, a7
-; RV64I-NEXT:    bltu a1, a4, .LBB10_2
+; RV64I-NEXT:    bltu a3, a4, .LBB10_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    subw a5, a1, a4
-; RV64I-NEXT:    sra a5, a3, a5
-; RV64I-NEXT:    bnez a1, .LBB10_3
+; RV64I-NEXT:    subw a5, a3, a4
+; RV64I-NEXT:    sra a5, a1, a5
+; RV64I-NEXT:    bnez a3, .LBB10_3
 ; RV64I-NEXT:    j .LBB10_4
 ; RV64I-NEXT:  .LBB10_2:
-; RV64I-NEXT:    srl a5, a0, a1
-; RV64I-NEXT:    negw a6, a1
-; RV64I-NEXT:    sll a6, a3, a6
+; RV64I-NEXT:    srl a5, a0, a3
+; RV64I-NEXT:    negw a6, a3
+; RV64I-NEXT:    sll a6, a1, a6
 ; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    beqz a1, .LBB10_4
+; RV64I-NEXT:    beqz a3, .LBB10_4
 ; RV64I-NEXT:  .LBB10_3:
 ; RV64I-NEXT:    mv a0, a5
 ; RV64I-NEXT:  .LBB10_4:
-; RV64I-NEXT:    bltu a1, a4, .LBB10_6
+; RV64I-NEXT:    bltu a3, a4, .LBB10_6
 ; RV64I-NEXT:  # %bb.5:
-; RV64I-NEXT:    srai a1, a3, 63
+; RV64I-NEXT:    srai a1, a1, 63
 ; RV64I-NEXT:    j .LBB10_7
 ; RV64I-NEXT:  .LBB10_6:
-; RV64I-NEXT:    sra a1, a3, a1
+; RV64I-NEXT:    sra a1, a1, a3
 ; RV64I-NEXT:  .LBB10_7:
 ; RV64I-NEXT:    srli a3, a0, 32
 ; RV64I-NEXT:    srliw a4, a0, 16
@@ -2139,8 +2117,6 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a3, 13(a2)
 ; RV64I-NEXT:    sb t4, 14(a2)
 ; RV64I-NEXT:    sb t5, 15(a2)
-; RV64I-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_16bytes:
@@ -2158,42 +2134,42 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu t0, 6(a0)
 ; RV32I-NEXT:    lbu t1, 7(a0)
 ; RV32I-NEXT:    lbu t4, 8(a0)
-; RV32I-NEXT:    lbu t5, 9(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
+; RV32I-NEXT:    lbu t3, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli s0, t3, 8
 ; RV32I-NEXT:    or t3, a7, a6
 ; RV32I-NEXT:    or t1, t1, t0
-; RV32I-NEXT:    lbu a6, 12(a0)
-; RV32I-NEXT:    lbu a7, 13(a0)
-; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    or a6, s0, t4
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 13(a0)
+; RV32I-NEXT:    lbu t4, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    or t5, s0, t6
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    lbu a7, 0(a1)
-; RV32I-NEXT:    lbu t6, 1(a1)
-; RV32I-NEXT:    lbu s0, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or s1, a0, t0
 ; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    or t6, t6, a7
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or t5, t6, t5
+; RV32I-NEXT:    or t6, t0, a7
+; RV32I-NEXT:    or a7, a0, t4
+; RV32I-NEXT:    lbu a0, 0(a1)
+; RV32I-NEXT:    lbu t0, 1(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or s0, t0, a0
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or s0, a1, s0
+; RV32I-NEXT:    or t4, a1, t4
 ; RV32I-NEXT:    li t0, 32
 ; RV32I-NEXT:    slli a1, a5, 8
 ; RV32I-NEXT:    slli a0, t2, 8
 ; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    or a7, t5, t4
-; RV32I-NEXT:    or a5, s1, a6
-; RV32I-NEXT:    or a6, s0, t6
+; RV32I-NEXT:    slli a5, a7, 16
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    or a7, t5, a6
+; RV32I-NEXT:    or a5, a5, t6
+; RV32I-NEXT:    or a6, t4, s0
 ; RV32I-NEXT:    slli a6, a6, 3
 ; RV32I-NEXT:    srl t2, a7, a6
 ; RV32I-NEXT:    neg t6, a6
@@ -2344,8 +2320,6 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_16bytes_wordOff:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -16
-; RV64I-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 0(a0)
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
@@ -2360,81 +2334,81 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    lbu t6, 11(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 13(a0)
-; RV64I-NEXT:    lbu s0, 14(a0)
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 13(a0)
+; RV64I-NEXT:    lbu t0, 14(a0)
 ; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or t1, t4, t3
-; RV64I-NEXT:    or t2, t6, t5
-; RV64I-NEXT:    lbu t3, 0(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    or t2, t4, t3
+; RV64I-NEXT:    or t3, t6, t5
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 0(a1)
 ; RV64I-NEXT:    lbu t4, 1(a1)
 ; RV64I-NEXT:    lbu t5, 2(a1)
 ; RV64I-NEXT:    lbu t6, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    slli a0, a0, 8
 ; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    or a0, a0, s0
-; RV64I-NEXT:    or a6, t4, t3
-; RV64I-NEXT:    lbu t3, 4(a1)
-; RV64I-NEXT:    lbu t4, 5(a1)
-; RV64I-NEXT:    lbu s0, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a7, t4, a7
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    lbu t4, 4(a1)
+; RV64I-NEXT:    lbu t5, 5(a1)
+; RV64I-NEXT:    lbu t6, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t5, t5, 8
+; RV64I-NEXT:    or t4, t5, t4
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, s0
+; RV64I-NEXT:    or a1, a1, t6
 ; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    or a4, t0, a7
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    or a7, t2, t1
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    or a4, t1, a5
+; RV64I-NEXT:    slli t3, t3, 16
+; RV64I-NEXT:    or a5, t3, t2
 ; RV64I-NEXT:    slli a0, a0, 16
-; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli t5, t5, 16
-; RV64I-NEXT:    or a5, t5, a6
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    slli a1, a1, 16
-; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a1, a1, t4
 ; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    slli a6, a0, 32
-; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    slli a7, a0, 32
+; RV64I-NEXT:    slli t0, a1, 32
 ; RV64I-NEXT:    or a0, a4, a3
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    slli a1, a1, 5
+; RV64I-NEXT:    or a1, a7, a5
+; RV64I-NEXT:    or a3, t0, a6
+; RV64I-NEXT:    slli a3, a3, 5
 ; RV64I-NEXT:    li a4, 64
-; RV64I-NEXT:    or a3, a6, a7
-; RV64I-NEXT:    bltu a1, a4, .LBB11_2
+; RV64I-NEXT:    bltu a3, a4, .LBB11_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    subw a5, a1, a4
-; RV64I-NEXT:    sra a5, a3, a5
-; RV64I-NEXT:    bnez a1, .LBB11_3
+; RV64I-NEXT:    subw a5, a3, a4
+; RV64I-NEXT:    sra a5, a1, a5
+; RV64I-NEXT:    bnez a3, .LBB11_3
 ; RV64I-NEXT:    j .LBB11_4
 ; RV64I-NEXT:  .LBB11_2:
-; RV64I-NEXT:    srl a5, a0, a1
-; RV64I-NEXT:    negw a6, a1
-; RV64I-NEXT:    sll a6, a3, a6
+; RV64I-NEXT:    srl a5, a0, a3
+; RV64I-NEXT:    negw a6, a3
+; RV64I-NEXT:    sll a6, a1, a6
 ; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    beqz a1, .LBB11_4
+; RV64I-NEXT:    beqz a3, .LBB11_4
 ; RV64I-NEXT:  .LBB11_3:
 ; RV64I-NEXT:    mv a0, a5
 ; RV64I-NEXT:  .LBB11_4:
-; RV64I-NEXT:    bltu a1, a4, .LBB11_6
+; RV64I-NEXT:    bltu a3, a4, .LBB11_6
 ; RV64I-NEXT:  # %bb.5:
-; RV64I-NEXT:    srai a1, a3, 63
+; RV64I-NEXT:    srai a1, a1, 63
 ; RV64I-NEXT:    j .LBB11_7
 ; RV64I-NEXT:  .LBB11_6:
-; RV64I-NEXT:    sra a1, a3, a1
+; RV64I-NEXT:    sra a1, a1, a3
 ; RV64I-NEXT:  .LBB11_7:
 ; RV64I-NEXT:    srli a3, a0, 32
 ; RV64I-NEXT:    srliw a4, a0, 16
@@ -2472,8 +2446,6 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    sb a3, 13(a2)
 ; RV64I-NEXT:    sb t4, 14(a2)
 ; RV64I-NEXT:    sb t5, 15(a2)
-; RV64I-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_16bytes_wordOff:
@@ -2491,42 +2463,42 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    lbu t0, 6(a0)
 ; RV32I-NEXT:    lbu t1, 7(a0)
 ; RV32I-NEXT:    lbu t4, 8(a0)
-; RV32I-NEXT:    lbu t5, 9(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t3, a7, a6
-; RV32I-NEXT:    or t1, t1, t0
-; RV32I-NEXT:    lbu a6, 12(a0)
-; RV32I-NEXT:    lbu a7, 13(a0)
-; RV32I-NEXT:    lbu t0, 14(a0)
-; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    or t5, s0, t6
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    lbu a7, 0(a1)
-; RV32I-NEXT:    lbu t6, 1(a1)
-; RV32I-NEXT:    lbu s0, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or s1, a0, t0
+; RV32I-NEXT:    lbu t3, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli s0, t3, 8
+; RV32I-NEXT:    or t3, a7, a6
+; RV32I-NEXT:    or t1, t1, t0
+; RV32I-NEXT:    or a6, s0, t4
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 13(a0)
+; RV32I-NEXT:    lbu t4, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
 ; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    or t6, t6, a7
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or t5, t6, t5
+; RV32I-NEXT:    or t6, t0, a7
+; RV32I-NEXT:    or a7, a0, t4
+; RV32I-NEXT:    lbu a0, 0(a1)
+; RV32I-NEXT:    lbu t0, 1(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or s0, t0, a0
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or s0, a1, s0
+; RV32I-NEXT:    or t4, a1, t4
 ; RV32I-NEXT:    li t0, 32
 ; RV32I-NEXT:    slli a1, a5, 8
 ; RV32I-NEXT:    slli a0, t2, 8
 ; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    or a7, t5, t4
-; RV32I-NEXT:    or a5, s1, a6
-; RV32I-NEXT:    or a6, s0, t6
+; RV32I-NEXT:    slli a5, a7, 16
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    or a7, t5, a6
+; RV32I-NEXT:    or a5, a5, t6
+; RV32I-NEXT:    or a6, t4, s0
 ; RV32I-NEXT:    slli a6, a6, 5
 ; RV32I-NEXT:    srl t2, a7, a6
 ; RV32I-NEXT:    neg t6, a6
@@ -2713,88 +2685,88 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    or t0, t2, t1
 ; RV64I-NEXT:    lbu s8, 20(a0)
 ; RV64I-NEXT:    lbu s9, 21(a0)
 ; RV64I-NEXT:    lbu s10, 22(a0)
 ; RV64I-NEXT:    lbu s11, 23(a0)
-; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
 ; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    slli s3, s3, 8
 ; RV64I-NEXT:    or a4, t4, t3
 ; RV64I-NEXT:    or a6, t6, t5
-; RV64I-NEXT:    or t0, s1, s0
-; RV64I-NEXT:    lbu t5, 24(a0)
-; RV64I-NEXT:    lbu t6, 25(a0)
-; RV64I-NEXT:    lbu s0, 26(a0)
-; RV64I-NEXT:    lbu s1, 27(a0)
-; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t3, 24(a0)
+; RV64I-NEXT:    lbu t4, 25(a0)
+; RV64I-NEXT:    lbu t5, 26(a0)
+; RV64I-NEXT:    lbu t6, 27(a0)
 ; RV64I-NEXT:    slli s5, s5, 8
 ; RV64I-NEXT:    slli s7, s7, 8
-; RV64I-NEXT:    or t4, s3, s2
-; RV64I-NEXT:    or t2, s5, s4
-; RV64I-NEXT:    or t3, s7, s6
-; RV64I-NEXT:    lbu s2, 28(a0)
-; RV64I-NEXT:    lbu s3, 29(a0)
-; RV64I-NEXT:    lbu s4, 30(a0)
-; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or s0, s5, s4
+; RV64I-NEXT:    or s1, s7, s6
+; RV64I-NEXT:    or s2, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli s11, s11, 8
+; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s5, s9, s8
+; RV64I-NEXT:    slli s4, s4, 8
 ; RV64I-NEXT:    or s6, s11, s10
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or t4, t6, t5
+; RV64I-NEXT:    or t5, s4, s3
 ; RV64I-NEXT:    lbu t6, 0(a1)
-; RV64I-NEXT:    lbu s1, 1(a1)
-; RV64I-NEXT:    lbu s7, 2(a1)
-; RV64I-NEXT:    lbu s8, 3(a1)
-; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
 ; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    or s3, a0, s4
-; RV64I-NEXT:    or t6, s1, t6
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s7, s7, 8
+; RV64I-NEXT:    or s5, a0, s5
+; RV64I-NEXT:    or t6, s3, t6
+; RV64I-NEXT:    or s3, s7, s4
 ; RV64I-NEXT:    lbu a0, 4(a1)
-; RV64I-NEXT:    lbu s1, 5(a1)
-; RV64I-NEXT:    lbu s4, 6(a1)
+; RV64I-NEXT:    lbu s4, 5(a1)
+; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    or s7, s8, s7
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s1, s1, a0
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or s4, s4, a0
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or s4, a1, s4
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    or a1, t1, a7
-; RV64I-NEXT:    slli t4, t4, 16
-; RV64I-NEXT:    or a0, t4, t0
-; RV64I-NEXT:    slli t3, t3, 16
-; RV64I-NEXT:    or t0, t3, t2
+; RV64I-NEXT:    or s7, a1, s7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    or a1, t0, a7
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    or a0, t2, t1
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    or s0, s1, s0
 ; RV64I-NEXT:    slli s6, s6, 16
-; RV64I-NEXT:    or t1, s6, s5
-; RV64I-NEXT:    slli s0, s0, 16
-; RV64I-NEXT:    or t3, s0, t5
+; RV64I-NEXT:    or t0, s6, s2
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    or t1, t4, t3
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    or t3, s5, t5
 ; RV64I-NEXT:    slli s3, s3, 16
-; RV64I-NEXT:    or t5, s3, s2
+; RV64I-NEXT:    or t5, s3, t6
 ; RV64I-NEXT:    slli s7, s7, 16
-; RV64I-NEXT:    or t6, s7, t6
-; RV64I-NEXT:    slli s4, s4, 16
-; RV64I-NEXT:    or s0, s4, s1
+; RV64I-NEXT:    or t6, s7, s4
 ; RV64I-NEXT:    li a7, 64
 ; RV64I-NEXT:    slli t4, a5, 16
 ; RV64I-NEXT:    slli t2, a6, 16
-; RV64I-NEXT:    slli t1, t1, 32
-; RV64I-NEXT:    slli t5, t5, 32
-; RV64I-NEXT:    slli s0, s0, 32
-; RV64I-NEXT:    or a6, t1, t0
-; RV64I-NEXT:    or t0, t5, t3
-; RV64I-NEXT:    or a5, s0, t6
+; RV64I-NEXT:    slli t0, t0, 32
+; RV64I-NEXT:    slli t3, t3, 32
+; RV64I-NEXT:    slli t6, t6, 32
+; RV64I-NEXT:    or a6, t0, s0
+; RV64I-NEXT:    or t0, t3, t1
+; RV64I-NEXT:    or a5, t6, t5
 ; RV64I-NEXT:    slli a5, a5, 3
 ; RV64I-NEXT:    subw t1, a5, a7
 ; RV64I-NEXT:    negw t5, a5
@@ -3008,49 +2980,49 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu t0, 21(a0)
 ; RV32I-NEXT:    lbu t1, 22(a0)
 ; RV32I-NEXT:    lbu t2, 23(a0)
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu t4, 25(a0)
-; RV32I-NEXT:    lbu t5, 26(a0)
-; RV32I-NEXT:    lbu t6, 27(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 24(a0)
+; RV32I-NEXT:    lbu t3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
 ; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli t3, t3, 8
 ; RV32I-NEXT:    or a6, a6, a5
 ; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a4, t2, t1
-; RV32I-NEXT:    lbu a7, 28(a0)
-; RV32I-NEXT:    lbu t0, 29(a0)
-; RV32I-NEXT:    lbu t1, 30(a0)
-; RV32I-NEXT:    lbu t2, 31(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t4, t4, t3
-; RV32I-NEXT:    or t5, t6, t5
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    lbu t0, 0(a1)
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or a4, t3, a4
+; RV32I-NEXT:    lbu t0, 28(a0)
+; RV32I-NEXT:    lbu t1, 29(a0)
+; RV32I-NEXT:    lbu t2, 30(a0)
+; RV32I-NEXT:    lbu t3, 31(a0)
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    lbu t2, 0(a1)
 ; RV32I-NEXT:    lbu t3, 1(a1)
-; RV32I-NEXT:    lbu t6, 2(a1)
+; RV32I-NEXT:    lbu t5, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
 ; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    or t0, t3, t0
+; RV32I-NEXT:    or t2, t3, t2
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or t2, a1, t6
+; RV32I-NEXT:    or t5, a1, t5
 ; RV32I-NEXT:    li t3, 32
 ; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a1, a4, 16
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli a4, t1, 16
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    or t1, t5, t4
-; RV32I-NEXT:    or t5, a4, a7
-; RV32I-NEXT:    or a4, t2, t0
+; RV32I-NEXT:    slli a1, a7, 16
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a7, t1, 16
+; RV32I-NEXT:    slli t6, t5, 16
+; RV32I-NEXT:    or t1, t4, a4
+; RV32I-NEXT:    or t5, a7, t0
+; RV32I-NEXT:    or a4, t6, t2
 ; RV32I-NEXT:    slli a4, a4, 3
-; RV32I-NEXT:    srl s0, t1, a4
+; RV32I-NEXT:    srl s1, t1, a4
 ; RV32I-NEXT:    neg s6, a4
 ; RV32I-NEXT:    sll t4, t5, s6
 ; RV32I-NEXT:    bltu a4, t3, .LBB12_2
@@ -3058,7 +3030,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srl a7, t5, a4
 ; RV32I-NEXT:    j .LBB12_3
 ; RV32I-NEXT:  .LBB12_2:
-; RV32I-NEXT:    or a7, s0, t4
+; RV32I-NEXT:    or a7, s1, t4
 ; RV32I-NEXT:  .LBB12_3:
 ; RV32I-NEXT:    or t0, a6, a3
 ; RV32I-NEXT:    or a6, a1, a5
@@ -3072,11 +3044,11 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw a3, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu a4, t3, .LBB12_7
 ; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    li ra, 0
+; RV32I-NEXT:    li s0, 0
 ; RV32I-NEXT:    srl a3, a6, a4
 ; RV32I-NEXT:    j .LBB12_8
 ; RV32I-NEXT:  .LBB12_7:
-; RV32I-NEXT:    srl ra, t5, a4
+; RV32I-NEXT:    srl s0, t5, a4
 ; RV32I-NEXT:    or a3, a3, a5
 ; RV32I-NEXT:  .LBB12_8:
 ; RV32I-NEXT:    li t6, 64
@@ -3113,29 +3085,29 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    mv t4, a3
 ; RV32I-NEXT:  .LBB12_18:
 ; RV32I-NEXT:    neg s11, s9
-; RV32I-NEXT:    sw s0, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu s9, t3, .LBB12_20
 ; RV32I-NEXT:  # %bb.19:
-; RV32I-NEXT:    srl s2, t5, s9
+; RV32I-NEXT:    srl s1, t5, s9
 ; RV32I-NEXT:    j .LBB12_21
 ; RV32I-NEXT:  .LBB12_20:
 ; RV32I-NEXT:    sll a3, t5, s11
-; RV32I-NEXT:    or s2, s0, a3
+; RV32I-NEXT:    or s1, s1, a3
 ; RV32I-NEXT:  .LBB12_21:
-; RV32I-NEXT:    lbu s1, 11(a0)
+; RV32I-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s2, 11(a0)
 ; RV32I-NEXT:    lbu a3, 15(a0)
 ; RV32I-NEXT:    mv s0, t1
 ; RV32I-NEXT:    beqz s9, .LBB12_23
 ; RV32I-NEXT:  # %bb.22:
-; RV32I-NEXT:    mv s0, s2
+; RV32I-NEXT:    mv s0, s1
 ; RV32I-NEXT:  .LBB12_23:
-; RV32I-NEXT:    lbu s4, 9(a0)
-; RV32I-NEXT:    lbu s2, 10(a0)
-; RV32I-NEXT:    lbu s5, 13(a0)
-; RV32I-NEXT:    lbu s8, 14(a0)
-; RV32I-NEXT:    slli s3, s1, 8
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    lbu s3, 9(a0)
+; RV32I-NEXT:    lbu s4, 10(a0)
+; RV32I-NEXT:    lbu s8, 13(a0)
+; RV32I-NEXT:    lbu ra, 14(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    sw ra, 40(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu s9, t3, .LBB12_25
 ; RV32I-NEXT:  # %bb.24:
 ; RV32I-NEXT:    li s1, 0
@@ -3143,12 +3115,12 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:  .LBB12_25:
 ; RV32I-NEXT:    srl s1, t5, a4
 ; RV32I-NEXT:  .LBB12_26:
-; RV32I-NEXT:    or s2, s3, s2
-; RV32I-NEXT:    lbu ra, 8(a0)
-; RV32I-NEXT:    lbu s3, 12(a0)
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    slli s5, s5, 8
-; RV32I-NEXT:    or s8, a3, s8
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s2, s2, s4
+; RV32I-NEXT:    lbu s5, 8(a0)
+; RV32I-NEXT:    lbu s4, 12(a0)
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or ra, a3, ra
 ; RV32I-NEXT:    bgeu a4, t6, .LBB12_28
 ; RV32I-NEXT:  # %bb.27:
 ; RV32I-NEXT:    or s0, a7, t2
@@ -3156,10 +3128,10 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:  .LBB12_28:
 ; RV32I-NEXT:    lbu a3, 3(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    or a5, s4, ra
+; RV32I-NEXT:    or a5, s3, s5
 ; RV32I-NEXT:    slli t4, s2, 16
-; RV32I-NEXT:    or s2, s5, s3
-; RV32I-NEXT:    slli s3, s8, 16
+; RV32I-NEXT:    or s2, s8, s4
+; RV32I-NEXT:    slli ra, ra, 16
 ; RV32I-NEXT:    mv s4, t0
 ; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beqz a4, .LBB12_30
@@ -3167,25 +3139,25 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    mv s4, s0
 ; RV32I-NEXT:    mv a7, s1
 ; RV32I-NEXT:  .LBB12_30:
-; RV32I-NEXT:    slli s5, a3, 8
-; RV32I-NEXT:    lbu ra, 1(a0)
+; RV32I-NEXT:    slli s3, a3, 8
+; RV32I-NEXT:    lbu s8, 1(a0)
 ; RV32I-NEXT:    lbu a3, 2(a0)
 ; RV32I-NEXT:    lbu s1, 5(a0)
 ; RV32I-NEXT:    lbu s0, 6(a0)
-; RV32I-NEXT:    slli s8, t2, 8
+; RV32I-NEXT:    slli s5, t2, 8
 ; RV32I-NEXT:    or t4, t4, a5
-; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    or t2, ra, s2
 ; RV32I-NEXT:    bltu a4, t6, .LBB12_32
 ; RV32I-NEXT:  # %bb.31:
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    sw zero, 40(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:  .LBB12_32:
-; RV32I-NEXT:    slli s3, ra, 8
-; RV32I-NEXT:    or a5, s5, a3
-; RV32I-NEXT:    lbu s5, 0(a0)
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or a5, s3, a3
+; RV32I-NEXT:    lbu s3, 0(a0)
 ; RV32I-NEXT:    lbu a0, 4(a0)
 ; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or a3, s8, s0
+; RV32I-NEXT:    or a3, s5, s0
 ; RV32I-NEXT:    srl s2, t4, a4
 ; RV32I-NEXT:    sll ra, t2, s6
 ; RV32I-NEXT:    bltu a4, t3, .LBB12_34
@@ -3195,7 +3167,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:  .LBB12_34:
 ; RV32I-NEXT:    or s0, s2, ra
 ; RV32I-NEXT:  .LBB12_35:
-; RV32I-NEXT:    or s3, s3, s5
+; RV32I-NEXT:    or s3, s8, s3
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    or a0, s1, a0
 ; RV32I-NEXT:    slli a3, a3, 16
@@ -3637,88 +3609,88 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    or t0, t2, t1
 ; RV64I-NEXT:    lbu s8, 20(a0)
 ; RV64I-NEXT:    lbu s9, 21(a0)
 ; RV64I-NEXT:    lbu s10, 22(a0)
 ; RV64I-NEXT:    lbu s11, 23(a0)
-; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
 ; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    slli s3, s3, 8
 ; RV64I-NEXT:    or a4, t4, t3
 ; RV64I-NEXT:    or a6, t6, t5
-; RV64I-NEXT:    or t0, s1, s0
-; RV64I-NEXT:    lbu t5, 24(a0)
-; RV64I-NEXT:    lbu t6, 25(a0)
-; RV64I-NEXT:    lbu s0, 26(a0)
-; RV64I-NEXT:    lbu s1, 27(a0)
-; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t3, 24(a0)
+; RV64I-NEXT:    lbu t4, 25(a0)
+; RV64I-NEXT:    lbu t5, 26(a0)
+; RV64I-NEXT:    lbu t6, 27(a0)
 ; RV64I-NEXT:    slli s5, s5, 8
 ; RV64I-NEXT:    slli s7, s7, 8
-; RV64I-NEXT:    or t4, s3, s2
-; RV64I-NEXT:    or t2, s5, s4
-; RV64I-NEXT:    or t3, s7, s6
-; RV64I-NEXT:    lbu s2, 28(a0)
-; RV64I-NEXT:    lbu s3, 29(a0)
-; RV64I-NEXT:    lbu s4, 30(a0)
-; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or s0, s5, s4
+; RV64I-NEXT:    or s1, s7, s6
+; RV64I-NEXT:    or s2, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli s11, s11, 8
+; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s5, s9, s8
+; RV64I-NEXT:    slli s4, s4, 8
 ; RV64I-NEXT:    or s6, s11, s10
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or t4, t6, t5
+; RV64I-NEXT:    or t5, s4, s3
 ; RV64I-NEXT:    lbu t6, 0(a1)
-; RV64I-NEXT:    lbu s1, 1(a1)
-; RV64I-NEXT:    lbu s7, 2(a1)
-; RV64I-NEXT:    lbu s8, 3(a1)
-; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
 ; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    or s3, a0, s4
-; RV64I-NEXT:    or t6, s1, t6
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s7, s7, 8
+; RV64I-NEXT:    or s5, a0, s5
+; RV64I-NEXT:    or t6, s3, t6
+; RV64I-NEXT:    or s3, s7, s4
 ; RV64I-NEXT:    lbu a0, 4(a1)
-; RV64I-NEXT:    lbu s1, 5(a1)
-; RV64I-NEXT:    lbu s4, 6(a1)
+; RV64I-NEXT:    lbu s4, 5(a1)
+; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    or s7, s8, s7
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s1, s1, a0
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or s4, s4, a0
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or s4, a1, s4
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    or a1, t1, a7
-; RV64I-NEXT:    slli t4, t4, 16
-; RV64I-NEXT:    or a0, t4, t0
-; RV64I-NEXT:    slli t3, t3, 16
-; RV64I-NEXT:    or t0, t3, t2
+; RV64I-NEXT:    or s7, a1, s7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    or a1, t0, a7
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    or a0, t2, t1
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    or s0, s1, s0
 ; RV64I-NEXT:    slli s6, s6, 16
-; RV64I-NEXT:    or t1, s6, s5
-; RV64I-NEXT:    slli s0, s0, 16
-; RV64I-NEXT:    or t3, s0, t5
+; RV64I-NEXT:    or t0, s6, s2
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    or t1, t4, t3
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    or t3, s5, t5
 ; RV64I-NEXT:    slli s3, s3, 16
-; RV64I-NEXT:    or t5, s3, s2
+; RV64I-NEXT:    or t5, s3, t6
 ; RV64I-NEXT:    slli s7, s7, 16
-; RV64I-NEXT:    or t6, s7, t6
-; RV64I-NEXT:    slli s4, s4, 16
-; RV64I-NEXT:    or s0, s4, s1
+; RV64I-NEXT:    or t6, s7, s4
 ; RV64I-NEXT:    li a7, 64
 ; RV64I-NEXT:    slli t4, a5, 16
 ; RV64I-NEXT:    slli t2, a6, 16
-; RV64I-NEXT:    slli t1, t1, 32
-; RV64I-NEXT:    slli t5, t5, 32
-; RV64I-NEXT:    slli s0, s0, 32
-; RV64I-NEXT:    or a6, t1, t0
-; RV64I-NEXT:    or t0, t5, t3
-; RV64I-NEXT:    or a5, s0, t6
+; RV64I-NEXT:    slli t0, t0, 32
+; RV64I-NEXT:    slli t3, t3, 32
+; RV64I-NEXT:    slli t6, t6, 32
+; RV64I-NEXT:    or a6, t0, s0
+; RV64I-NEXT:    or t0, t3, t1
+; RV64I-NEXT:    or a5, t6, t5
 ; RV64I-NEXT:    slli a5, a5, 5
 ; RV64I-NEXT:    subw t1, a5, a7
 ; RV64I-NEXT:    negw t5, a5
@@ -3932,49 +3904,49 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    lbu t0, 21(a0)
 ; RV32I-NEXT:    lbu t1, 22(a0)
 ; RV32I-NEXT:    lbu t2, 23(a0)
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu t4, 25(a0)
-; RV32I-NEXT:    lbu t5, 26(a0)
-; RV32I-NEXT:    lbu t6, 27(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 24(a0)
+; RV32I-NEXT:    lbu t3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
 ; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli t3, t3, 8
 ; RV32I-NEXT:    or a6, a6, a5
 ; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a4, t2, t1
-; RV32I-NEXT:    lbu a7, 28(a0)
-; RV32I-NEXT:    lbu t0, 29(a0)
-; RV32I-NEXT:    lbu t1, 30(a0)
-; RV32I-NEXT:    lbu t2, 31(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t4, t4, t3
-; RV32I-NEXT:    or t5, t6, t5
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    lbu t0, 0(a1)
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or a4, t3, a4
+; RV32I-NEXT:    lbu t0, 28(a0)
+; RV32I-NEXT:    lbu t1, 29(a0)
+; RV32I-NEXT:    lbu t2, 30(a0)
+; RV32I-NEXT:    lbu t3, 31(a0)
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    lbu t2, 0(a1)
 ; RV32I-NEXT:    lbu t3, 1(a1)
-; RV32I-NEXT:    lbu t6, 2(a1)
+; RV32I-NEXT:    lbu t5, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
 ; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    or t0, t3, t0
+; RV32I-NEXT:    or t2, t3, t2
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or t2, a1, t6
+; RV32I-NEXT:    or t5, a1, t5
 ; RV32I-NEXT:    li t3, 32
 ; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a1, a4, 16
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli a4, t1, 16
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    or t1, t5, t4
-; RV32I-NEXT:    or t5, a4, a7
-; RV32I-NEXT:    or a4, t2, t0
+; RV32I-NEXT:    slli a1, a7, 16
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a7, t1, 16
+; RV32I-NEXT:    slli t6, t5, 16
+; RV32I-NEXT:    or t1, t4, a4
+; RV32I-NEXT:    or t5, a7, t0
+; RV32I-NEXT:    or a4, t6, t2
 ; RV32I-NEXT:    slli a4, a4, 5
-; RV32I-NEXT:    srl s0, t1, a4
+; RV32I-NEXT:    srl s1, t1, a4
 ; RV32I-NEXT:    neg s6, a4
 ; RV32I-NEXT:    sll t4, t5, s6
 ; RV32I-NEXT:    bltu a4, t3, .LBB13_2
@@ -3982,7 +3954,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    srl a7, t5, a4
 ; RV32I-NEXT:    j .LBB13_3
 ; RV32I-NEXT:  .LBB13_2:
-; RV32I-NEXT:    or a7, s0, t4
+; RV32I-NEXT:    or a7, s1, t4
 ; RV32I-NEXT:  .LBB13_3:
 ; RV32I-NEXT:    or t0, a6, a3
 ; RV32I-NEXT:    or a6, a1, a5
@@ -3996,11 +3968,11 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    sw a3, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu a4, t3, .LBB13_7
 ; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    li ra, 0
+; RV32I-NEXT:    li s0, 0
 ; RV32I-NEXT:    srl a3, a6, a4
 ; RV32I-NEXT:    j .LBB13_8
 ; RV32I-NEXT:  .LBB13_7:
-; RV32I-NEXT:    srl ra, t5, a4
+; RV32I-NEXT:    srl s0, t5, a4
 ; RV32I-NEXT:    or a3, a3, a5
 ; RV32I-NEXT:  .LBB13_8:
 ; RV32I-NEXT:    li t6, 64
@@ -4037,29 +4009,29 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    mv t4, a3
 ; RV32I-NEXT:  .LBB13_18:
 ; RV32I-NEXT:    neg s11, s9
-; RV32I-NEXT:    sw s0, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu s9, t3, .LBB13_20
 ; RV32I-NEXT:  # %bb.19:
-; RV32I-NEXT:    srl s2, t5, s9
+; RV32I-NEXT:    srl s1, t5, s9
 ; RV32I-NEXT:    j .LBB13_21
 ; RV32I-NEXT:  .LBB13_20:
 ; RV32I-NEXT:    sll a3, t5, s11
-; RV32I-NEXT:    or s2, s0, a3
+; RV32I-NEXT:    or s1, s1, a3
 ; RV32I-NEXT:  .LBB13_21:
-; RV32I-NEXT:    lbu s1, 11(a0)
+; RV32I-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s2, 11(a0)
 ; RV32I-NEXT:    lbu a3, 15(a0)
 ; RV32I-NEXT:    mv s0, t1
 ; RV32I-NEXT:    beqz s9, .LBB13_23
 ; RV32I-NEXT:  # %bb.22:
-; RV32I-NEXT:    mv s0, s2
+; RV32I-NEXT:    mv s0, s1
 ; RV32I-NEXT:  .LBB13_23:
-; RV32I-NEXT:    lbu s4, 9(a0)
-; RV32I-NEXT:    lbu s2, 10(a0)
-; RV32I-NEXT:    lbu s5, 13(a0)
-; RV32I-NEXT:    lbu s8, 14(a0)
-; RV32I-NEXT:    slli s3, s1, 8
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    lbu s3, 9(a0)
+; RV32I-NEXT:    lbu s4, 10(a0)
+; RV32I-NEXT:    lbu s8, 13(a0)
+; RV32I-NEXT:    lbu ra, 14(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    sw ra, 40(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu s9, t3, .LBB13_25
 ; RV32I-NEXT:  # %bb.24:
 ; RV32I-NEXT:    li s1, 0
@@ -4067,12 +4039,12 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:  .LBB13_25:
 ; RV32I-NEXT:    srl s1, t5, a4
 ; RV32I-NEXT:  .LBB13_26:
-; RV32I-NEXT:    or s2, s3, s2
-; RV32I-NEXT:    lbu ra, 8(a0)
-; RV32I-NEXT:    lbu s3, 12(a0)
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    slli s5, s5, 8
-; RV32I-NEXT:    or s8, a3, s8
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s2, s2, s4
+; RV32I-NEXT:    lbu s5, 8(a0)
+; RV32I-NEXT:    lbu s4, 12(a0)
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or ra, a3, ra
 ; RV32I-NEXT:    bgeu a4, t6, .LBB13_28
 ; RV32I-NEXT:  # %bb.27:
 ; RV32I-NEXT:    or s0, a7, t2
@@ -4080,10 +4052,10 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:  .LBB13_28:
 ; RV32I-NEXT:    lbu a3, 3(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    or a5, s4, ra
+; RV32I-NEXT:    or a5, s3, s5
 ; RV32I-NEXT:    slli t4, s2, 16
-; RV32I-NEXT:    or s2, s5, s3
-; RV32I-NEXT:    slli s3, s8, 16
+; RV32I-NEXT:    or s2, s8, s4
+; RV32I-NEXT:    slli ra, ra, 16
 ; RV32I-NEXT:    mv s4, t0
 ; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beqz a4, .LBB13_30
@@ -4091,25 +4063,25 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    mv s4, s0
 ; RV32I-NEXT:    mv a7, s1
 ; RV32I-NEXT:  .LBB13_30:
-; RV32I-NEXT:    slli s5, a3, 8
-; RV32I-NEXT:    lbu ra, 1(a0)
+; RV32I-NEXT:    slli s3, a3, 8
+; RV32I-NEXT:    lbu s8, 1(a0)
 ; RV32I-NEXT:    lbu a3, 2(a0)
 ; RV32I-NEXT:    lbu s1, 5(a0)
 ; RV32I-NEXT:    lbu s0, 6(a0)
-; RV32I-NEXT:    slli s8, t2, 8
+; RV32I-NEXT:    slli s5, t2, 8
 ; RV32I-NEXT:    or t4, t4, a5
-; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    or t2, ra, s2
 ; RV32I-NEXT:    bltu a4, t6, .LBB13_32
 ; RV32I-NEXT:  # %bb.31:
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    sw zero, 40(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:  .LBB13_32:
-; RV32I-NEXT:    slli s3, ra, 8
-; RV32I-NEXT:    or a5, s5, a3
-; RV32I-NEXT:    lbu s5, 0(a0)
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or a5, s3, a3
+; RV32I-NEXT:    lbu s3, 0(a0)
 ; RV32I-NEXT:    lbu a0, 4(a0)
 ; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or a3, s8, s0
+; RV32I-NEXT:    or a3, s5, s0
 ; RV32I-NEXT:    srl s2, t4, a4
 ; RV32I-NEXT:    sll ra, t2, s6
 ; RV32I-NEXT:    bltu a4, t3, .LBB13_34
@@ -4119,7 +4091,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:  .LBB13_34:
 ; RV32I-NEXT:    or s0, s2, ra
 ; RV32I-NEXT:  .LBB13_35:
-; RV32I-NEXT:    or s3, s3, s5
+; RV32I-NEXT:    or s3, s8, s3
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    or a0, s1, a0
 ; RV32I-NEXT:    slli a3, a3, 16
@@ -4561,88 +4533,88 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    or t0, t2, t1
 ; RV64I-NEXT:    lbu s8, 20(a0)
 ; RV64I-NEXT:    lbu s9, 21(a0)
 ; RV64I-NEXT:    lbu s10, 22(a0)
 ; RV64I-NEXT:    lbu s11, 23(a0)
-; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
 ; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    slli s3, s3, 8
 ; RV64I-NEXT:    or a4, t4, t3
 ; RV64I-NEXT:    or a6, t6, t5
-; RV64I-NEXT:    or t0, s1, s0
-; RV64I-NEXT:    lbu t5, 24(a0)
-; RV64I-NEXT:    lbu t6, 25(a0)
-; RV64I-NEXT:    lbu s0, 26(a0)
-; RV64I-NEXT:    lbu s1, 27(a0)
-; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t3, 24(a0)
+; RV64I-NEXT:    lbu t4, 25(a0)
+; RV64I-NEXT:    lbu t5, 26(a0)
+; RV64I-NEXT:    lbu t6, 27(a0)
 ; RV64I-NEXT:    slli s5, s5, 8
 ; RV64I-NEXT:    slli s7, s7, 8
-; RV64I-NEXT:    or t4, s3, s2
-; RV64I-NEXT:    or t2, s5, s4
-; RV64I-NEXT:    or t3, s7, s6
-; RV64I-NEXT:    lbu s2, 28(a0)
-; RV64I-NEXT:    lbu s3, 29(a0)
-; RV64I-NEXT:    lbu s4, 30(a0)
-; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or s0, s5, s4
+; RV64I-NEXT:    or s1, s7, s6
+; RV64I-NEXT:    or s2, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli s11, s11, 8
+; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s5, s9, s8
+; RV64I-NEXT:    slli s4, s4, 8
 ; RV64I-NEXT:    or s6, s11, s10
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or t4, t6, t5
+; RV64I-NEXT:    or t5, s4, s3
 ; RV64I-NEXT:    lbu t6, 0(a1)
-; RV64I-NEXT:    lbu s1, 1(a1)
-; RV64I-NEXT:    lbu s7, 2(a1)
-; RV64I-NEXT:    lbu s8, 3(a1)
-; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
 ; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    or s3, a0, s4
-; RV64I-NEXT:    or t6, s1, t6
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s7, s7, 8
+; RV64I-NEXT:    or s5, a0, s5
+; RV64I-NEXT:    or t6, s3, t6
+; RV64I-NEXT:    or s3, s7, s4
 ; RV64I-NEXT:    lbu a0, 4(a1)
-; RV64I-NEXT:    lbu s1, 5(a1)
-; RV64I-NEXT:    lbu s4, 6(a1)
+; RV64I-NEXT:    lbu s4, 5(a1)
+; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    or s7, s8, s7
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s1, s1, a0
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or s4, s4, a0
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or s4, a1, s4
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    or a1, t1, a7
-; RV64I-NEXT:    slli t4, t4, 16
-; RV64I-NEXT:    or a0, t4, t0
-; RV64I-NEXT:    slli t3, t3, 16
-; RV64I-NEXT:    or t0, t3, t2
+; RV64I-NEXT:    or s7, a1, s7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    or a1, t0, a7
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    or a0, t2, t1
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    or s0, s1, s0
 ; RV64I-NEXT:    slli s6, s6, 16
-; RV64I-NEXT:    or t1, s6, s5
-; RV64I-NEXT:    slli s0, s0, 16
-; RV64I-NEXT:    or t3, s0, t5
+; RV64I-NEXT:    or t0, s6, s2
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    or t1, t4, t3
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    or t3, s5, t5
 ; RV64I-NEXT:    slli s3, s3, 16
-; RV64I-NEXT:    or t5, s3, s2
+; RV64I-NEXT:    or t5, s3, t6
 ; RV64I-NEXT:    slli s7, s7, 16
-; RV64I-NEXT:    or t6, s7, t6
-; RV64I-NEXT:    slli s4, s4, 16
-; RV64I-NEXT:    or s0, s4, s1
+; RV64I-NEXT:    or t6, s7, s4
 ; RV64I-NEXT:    li a7, 64
 ; RV64I-NEXT:    slli t4, a5, 16
 ; RV64I-NEXT:    slli t2, a6, 16
-; RV64I-NEXT:    slli t1, t1, 32
-; RV64I-NEXT:    slli t5, t5, 32
-; RV64I-NEXT:    slli s0, s0, 32
-; RV64I-NEXT:    or a6, t1, t0
-; RV64I-NEXT:    or t0, t5, t3
-; RV64I-NEXT:    or a5, s0, t6
+; RV64I-NEXT:    slli t0, t0, 32
+; RV64I-NEXT:    slli t3, t3, 32
+; RV64I-NEXT:    slli t6, t6, 32
+; RV64I-NEXT:    or a6, t0, s0
+; RV64I-NEXT:    or t0, t3, t1
+; RV64I-NEXT:    or a5, t6, t5
 ; RV64I-NEXT:    slli a5, a5, 6
 ; RV64I-NEXT:    subw t1, a5, a7
 ; RV64I-NEXT:    negw t5, a5
@@ -4856,49 +4828,49 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    lbu t0, 21(a0)
 ; RV32I-NEXT:    lbu t1, 22(a0)
 ; RV32I-NEXT:    lbu t2, 23(a0)
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu t4, 25(a0)
-; RV32I-NEXT:    lbu t5, 26(a0)
-; RV32I-NEXT:    lbu t6, 27(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 24(a0)
+; RV32I-NEXT:    lbu t3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
 ; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli t3, t3, 8
 ; RV32I-NEXT:    or a6, a6, a5
 ; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a4, t2, t1
-; RV32I-NEXT:    lbu a7, 28(a0)
-; RV32I-NEXT:    lbu t0, 29(a0)
-; RV32I-NEXT:    lbu t1, 30(a0)
-; RV32I-NEXT:    lbu t2, 31(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t4, t4, t3
-; RV32I-NEXT:    or t5, t6, t5
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    lbu t0, 0(a1)
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or a4, t3, a4
+; RV32I-NEXT:    lbu t0, 28(a0)
+; RV32I-NEXT:    lbu t1, 29(a0)
+; RV32I-NEXT:    lbu t2, 30(a0)
+; RV32I-NEXT:    lbu t3, 31(a0)
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    lbu t2, 0(a1)
 ; RV32I-NEXT:    lbu t3, 1(a1)
-; RV32I-NEXT:    lbu t6, 2(a1)
+; RV32I-NEXT:    lbu t5, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
 ; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    or t0, t3, t0
+; RV32I-NEXT:    or t2, t3, t2
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or t2, a1, t6
+; RV32I-NEXT:    or t5, a1, t5
 ; RV32I-NEXT:    li t3, 32
 ; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a1, a4, 16
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli a4, t1, 16
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    or t1, t5, t4
-; RV32I-NEXT:    or t5, a4, a7
-; RV32I-NEXT:    or a4, t2, t0
+; RV32I-NEXT:    slli a1, a7, 16
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a7, t1, 16
+; RV32I-NEXT:    slli t6, t5, 16
+; RV32I-NEXT:    or t1, t4, a4
+; RV32I-NEXT:    or t5, a7, t0
+; RV32I-NEXT:    or a4, t6, t2
 ; RV32I-NEXT:    slli a4, a4, 6
-; RV32I-NEXT:    srl s0, t1, a4
+; RV32I-NEXT:    srl s1, t1, a4
 ; RV32I-NEXT:    neg s6, a4
 ; RV32I-NEXT:    sll t4, t5, s6
 ; RV32I-NEXT:    bltu a4, t3, .LBB14_2
@@ -4906,7 +4878,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    srl a7, t5, a4
 ; RV32I-NEXT:    j .LBB14_3
 ; RV32I-NEXT:  .LBB14_2:
-; RV32I-NEXT:    or a7, s0, t4
+; RV32I-NEXT:    or a7, s1, t4
 ; RV32I-NEXT:  .LBB14_3:
 ; RV32I-NEXT:    or t0, a6, a3
 ; RV32I-NEXT:    or a6, a1, a5
@@ -4920,11 +4892,11 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    sw a3, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu a4, t3, .LBB14_7
 ; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    li ra, 0
+; RV32I-NEXT:    li s0, 0
 ; RV32I-NEXT:    srl a3, a6, a4
 ; RV32I-NEXT:    j .LBB14_8
 ; RV32I-NEXT:  .LBB14_7:
-; RV32I-NEXT:    srl ra, t5, a4
+; RV32I-NEXT:    srl s0, t5, a4
 ; RV32I-NEXT:    or a3, a3, a5
 ; RV32I-NEXT:  .LBB14_8:
 ; RV32I-NEXT:    li t6, 64
@@ -4961,29 +4933,29 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    mv t4, a3
 ; RV32I-NEXT:  .LBB14_18:
 ; RV32I-NEXT:    neg s11, s9
-; RV32I-NEXT:    sw s0, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu s9, t3, .LBB14_20
 ; RV32I-NEXT:  # %bb.19:
-; RV32I-NEXT:    srl s2, t5, s9
+; RV32I-NEXT:    srl s1, t5, s9
 ; RV32I-NEXT:    j .LBB14_21
 ; RV32I-NEXT:  .LBB14_20:
 ; RV32I-NEXT:    sll a3, t5, s11
-; RV32I-NEXT:    or s2, s0, a3
+; RV32I-NEXT:    or s1, s1, a3
 ; RV32I-NEXT:  .LBB14_21:
-; RV32I-NEXT:    lbu s1, 11(a0)
+; RV32I-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s2, 11(a0)
 ; RV32I-NEXT:    lbu a3, 15(a0)
 ; RV32I-NEXT:    mv s0, t1
 ; RV32I-NEXT:    beqz s9, .LBB14_23
 ; RV32I-NEXT:  # %bb.22:
-; RV32I-NEXT:    mv s0, s2
+; RV32I-NEXT:    mv s0, s1
 ; RV32I-NEXT:  .LBB14_23:
-; RV32I-NEXT:    lbu s4, 9(a0)
-; RV32I-NEXT:    lbu s2, 10(a0)
-; RV32I-NEXT:    lbu s5, 13(a0)
-; RV32I-NEXT:    lbu s8, 14(a0)
-; RV32I-NEXT:    slli s3, s1, 8
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    lbu s3, 9(a0)
+; RV32I-NEXT:    lbu s4, 10(a0)
+; RV32I-NEXT:    lbu s8, 13(a0)
+; RV32I-NEXT:    lbu ra, 14(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    sw ra, 40(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu s9, t3, .LBB14_25
 ; RV32I-NEXT:  # %bb.24:
 ; RV32I-NEXT:    li s1, 0
@@ -4991,12 +4963,12 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:  .LBB14_25:
 ; RV32I-NEXT:    srl s1, t5, a4
 ; RV32I-NEXT:  .LBB14_26:
-; RV32I-NEXT:    or s2, s3, s2
-; RV32I-NEXT:    lbu ra, 8(a0)
-; RV32I-NEXT:    lbu s3, 12(a0)
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    slli s5, s5, 8
-; RV32I-NEXT:    or s8, a3, s8
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s2, s2, s4
+; RV32I-NEXT:    lbu s5, 8(a0)
+; RV32I-NEXT:    lbu s4, 12(a0)
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or ra, a3, ra
 ; RV32I-NEXT:    bgeu a4, t6, .LBB14_28
 ; RV32I-NEXT:  # %bb.27:
 ; RV32I-NEXT:    or s0, a7, t2
@@ -5004,10 +4976,10 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:  .LBB14_28:
 ; RV32I-NEXT:    lbu a3, 3(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    or a5, s4, ra
+; RV32I-NEXT:    or a5, s3, s5
 ; RV32I-NEXT:    slli t4, s2, 16
-; RV32I-NEXT:    or s2, s5, s3
-; RV32I-NEXT:    slli s3, s8, 16
+; RV32I-NEXT:    or s2, s8, s4
+; RV32I-NEXT:    slli ra, ra, 16
 ; RV32I-NEXT:    mv s4, t0
 ; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beqz a4, .LBB14_30
@@ -5015,25 +4987,25 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    mv s4, s0
 ; RV32I-NEXT:    mv a7, s1
 ; RV32I-NEXT:  .LBB14_30:
-; RV32I-NEXT:    slli s5, a3, 8
-; RV32I-NEXT:    lbu ra, 1(a0)
+; RV32I-NEXT:    slli s3, a3, 8
+; RV32I-NEXT:    lbu s8, 1(a0)
 ; RV32I-NEXT:    lbu a3, 2(a0)
 ; RV32I-NEXT:    lbu s1, 5(a0)
 ; RV32I-NEXT:    lbu s0, 6(a0)
-; RV32I-NEXT:    slli s8, t2, 8
+; RV32I-NEXT:    slli s5, t2, 8
 ; RV32I-NEXT:    or t4, t4, a5
-; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    or t2, ra, s2
 ; RV32I-NEXT:    bltu a4, t6, .LBB14_32
 ; RV32I-NEXT:  # %bb.31:
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    sw zero, 40(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:  .LBB14_32:
-; RV32I-NEXT:    slli s3, ra, 8
-; RV32I-NEXT:    or a5, s5, a3
-; RV32I-NEXT:    lbu s5, 0(a0)
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or a5, s3, a3
+; RV32I-NEXT:    lbu s3, 0(a0)
 ; RV32I-NEXT:    lbu a0, 4(a0)
 ; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or a3, s8, s0
+; RV32I-NEXT:    or a3, s5, s0
 ; RV32I-NEXT:    srl s2, t4, a4
 ; RV32I-NEXT:    sll ra, t2, s6
 ; RV32I-NEXT:    bltu a4, t3, .LBB14_34
@@ -5043,7 +5015,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:  .LBB14_34:
 ; RV32I-NEXT:    or s0, s2, ra
 ; RV32I-NEXT:  .LBB14_35:
-; RV32I-NEXT:    or s3, s3, s5
+; RV32I-NEXT:    or s3, s8, s3
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    or a0, s1, a0
 ; RV32I-NEXT:    slli a3, a3, 16
@@ -5784,54 +5756,54 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli t3, t3, 8
 ; RV32I-NEXT:    or a5, a7, a5
 ; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    lbu t0, 0(a1)
-; RV32I-NEXT:    lbu t1, 1(a1)
-; RV32I-NEXT:    or t2, t3, t2
+; RV32I-NEXT:    or t0, t3, t2
+; RV32I-NEXT:    lbu t1, 0(a1)
+; RV32I-NEXT:    lbu t2, 1(a1)
 ; RV32I-NEXT:    lbu t3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t1, t2, t1
 ; RV32I-NEXT:    li s9, 64
 ; RV32I-NEXT:    slli a1, a1, 8
 ; RV32I-NEXT:    or a1, a1, t3
 ; RV32I-NEXT:    li t4, 32
 ; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a1, a1, 16
 ; RV32I-NEXT:    or t3, a5, a4
-; RV32I-NEXT:    or a5, t2, a7
-; RV32I-NEXT:    or a4, a1, t0
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a4, a1, t1
 ; RV32I-NEXT:    slli a4, a4, 3
-; RV32I-NEXT:    neg s10, a4
-; RV32I-NEXT:    srl t5, t3, s10
-; RV32I-NEXT:    sll s5, a5, a4
+; RV32I-NEXT:    neg s5, a4
+; RV32I-NEXT:    srl t5, t3, s5
+; RV32I-NEXT:    sll s10, a5, a4
 ; RV32I-NEXT:    bltu a4, t4, .LBB15_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    li s8, 0
-; RV32I-NEXT:    sll a7, t3, a4
+; RV32I-NEXT:    sll t0, t3, a4
 ; RV32I-NEXT:    j .LBB15_3
 ; RV32I-NEXT:  .LBB15_2:
 ; RV32I-NEXT:    sll s8, t3, a4
-; RV32I-NEXT:    or a7, t5, s5
+; RV32I-NEXT:    or t0, t5, s10
 ; RV32I-NEXT:  .LBB15_3:
+; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    lbu t2, 9(a0)
-; RV32I-NEXT:    lbu a1, 10(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
 ; RV32I-NEXT:    lbu t1, 13(a0)
-; RV32I-NEXT:    lbu t0, 14(a0)
-; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    lbu a1, 14(a0)
 ; RV32I-NEXT:    slli t6, a3, 8
 ; RV32I-NEXT:    sub s6, s9, a4
 ; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    beqz a4, .LBB15_5
 ; RV32I-NEXT:  # %bb.4:
-; RV32I-NEXT:    mv a3, a7
+; RV32I-NEXT:    mv a3, t0
 ; RV32I-NEXT:  .LBB15_5:
-; RV32I-NEXT:    slli a7, t2, 8
-; RV32I-NEXT:    or a6, a6, a1
+; RV32I-NEXT:    slli t0, t2, 8
+; RV32I-NEXT:    or a6, a6, a7
 ; RV32I-NEXT:    lbu t2, 8(a0)
-; RV32I-NEXT:    lbu a1, 12(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
 ; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t6, t0
+; RV32I-NEXT:    or a1, t6, a1
 ; RV32I-NEXT:    neg t6, s6
 ; RV32I-NEXT:    sw t6, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu s6, t4, .LBB15_7
@@ -5842,25 +5814,25 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sll t6, a5, t6
 ; RV32I-NEXT:    or t6, t5, t6
 ; RV32I-NEXT:  .LBB15_8:
-; RV32I-NEXT:    or a7, a7, t2
+; RV32I-NEXT:    or t0, t0, t2
 ; RV32I-NEXT:    slli t2, a6, 16
-; RV32I-NEXT:    or a1, t1, a1
-; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    or a7, t1, a7
+; RV32I-NEXT:    slli a1, a1, 16
 ; RV32I-NEXT:    mv a6, t3
 ; RV32I-NEXT:    beqz s6, .LBB15_10
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    mv a6, t6
 ; RV32I-NEXT:  .LBB15_10:
-; RV32I-NEXT:    or t1, t2, a7
-; RV32I-NEXT:    or t2, t0, a1
+; RV32I-NEXT:    or t1, t2, t0
+; RV32I-NEXT:    or t2, a1, a7
 ; RV32I-NEXT:    bltu s6, t4, .LBB15_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    li a7, 0
+; RV32I-NEXT:    li t0, 0
 ; RV32I-NEXT:    j .LBB15_13
 ; RV32I-NEXT:  .LBB15_12:
-; RV32I-NEXT:    srl a7, a5, s10
+; RV32I-NEXT:    srl t0, a5, s5
 ; RV32I-NEXT:  .LBB15_13:
-; RV32I-NEXT:    srl s0, t1, s10
+; RV32I-NEXT:    srl s0, t1, s5
 ; RV32I-NEXT:    sll a1, t2, a4
 ; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu a4, t4, .LBB15_15
@@ -5890,7 +5862,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:  .LBB15_20:
 ; RV32I-NEXT:    sll s2, t3, a4
 ; RV32I-NEXT:    srl a1, t3, a1
-; RV32I-NEXT:    or a1, a1, s5
+; RV32I-NEXT:    or a1, a1, s10
 ; RV32I-NEXT:    mv s4, a5
 ; RV32I-NEXT:    beqz s7, .LBB15_22
 ; RV32I-NEXT:  .LBB15_21:
@@ -5905,7 +5877,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:  .LBB15_24:
 ; RV32I-NEXT:    sw s8, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    or s2, a6, s1
-; RV32I-NEXT:    or s4, a7, s3
+; RV32I-NEXT:    or s4, t0, s3
 ; RV32I-NEXT:  .LBB15_25:
 ; RV32I-NEXT:    sub ra, a1, a4
 ; RV32I-NEXT:    mv a7, t1
@@ -5920,15 +5892,15 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    bltu ra, t4, .LBB15_29
 ; RV32I-NEXT:  # %bb.28:
 ; RV32I-NEXT:    srl a1, t2, ra
-; RV32I-NEXT:    sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv t0, t1
 ; RV32I-NEXT:    bnez ra, .LBB15_30
 ; RV32I-NEXT:    j .LBB15_31
 ; RV32I-NEXT:  .LBB15_29:
 ; RV32I-NEXT:    or a1, s0, s2
-; RV32I-NEXT:    sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv t0, t1
 ; RV32I-NEXT:    beqz ra, .LBB15_31
 ; RV32I-NEXT:  .LBB15_30:
-; RV32I-NEXT:    sw a1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv t0, a1
 ; RV32I-NEXT:  .LBB15_31:
 ; RV32I-NEXT:    bltu ra, t4, .LBB15_33
 ; RV32I-NEXT:  # %bb.32:
@@ -5938,7 +5910,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    bnez ra, .LBB15_34
 ; RV32I-NEXT:    j .LBB15_35
 ; RV32I-NEXT:  .LBB15_33:
-; RV32I-NEXT:    srl a1, t2, s10
+; RV32I-NEXT:    srl a1, t2, s5
 ; RV32I-NEXT:    sw a1, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sll a1, a5, s1
 ; RV32I-NEXT:    or a1, t5, a1
@@ -5959,7 +5931,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a1, a1, s2
 ; RV32I-NEXT:    j .LBB15_40
 ; RV32I-NEXT:  .LBB15_38:
-; RV32I-NEXT:    srl a1, a5, s10
+; RV32I-NEXT:    srl a1, a5, s5
 ; RV32I-NEXT:    sw a1, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu s3, t4, .LBB15_37
 ; RV32I-NEXT:  .LBB15_39:
@@ -5972,35 +5944,33 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:  # %bb.41:
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:  .LBB15_42:
-; RV32I-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s5, a7
+; RV32I-NEXT:    sw t0, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv t0, a7
 ; RV32I-NEXT:    bltu s4, t4, .LBB15_44
 ; RV32I-NEXT:  # %bb.43:
-; RV32I-NEXT:    srl t0, t2, s4
+; RV32I-NEXT:    srl a7, t2, s4
 ; RV32I-NEXT:    j .LBB15_45
 ; RV32I-NEXT:  .LBB15_44:
 ; RV32I-NEXT:    srl a1, t1, ra
-; RV32I-NEXT:    neg t0, s4
-; RV32I-NEXT:    sll t0, t2, t0
-; RV32I-NEXT:    or t0, a1, t0
+; RV32I-NEXT:    neg a7, s4
+; RV32I-NEXT:    sll a7, t2, a7
+; RV32I-NEXT:    or a7, a1, a7
 ; RV32I-NEXT:  .LBB15_45:
-; RV32I-NEXT:    mv s0, s10
-; RV32I-NEXT:    mv a7, a6
-; RV32I-NEXT:    lbu s8, 19(a0)
+; RV32I-NEXT:    sw s10, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    li s0, 64
+; RV32I-NEXT:    lbu t6, 19(a0)
 ; RV32I-NEXT:    lbu a1, 23(a0)
 ; RV32I-NEXT:    mv s3, t1
 ; RV32I-NEXT:    beqz s4, .LBB15_47
 ; RV32I-NEXT:  # %bb.46:
-; RV32I-NEXT:    mv s3, t0
+; RV32I-NEXT:    mv s3, a7
 ; RV32I-NEXT:  .LBB15_47:
-; RV32I-NEXT:    mv a6, a3
-; RV32I-NEXT:    lbu s10, 17(a0)
-; RV32I-NEXT:    lbu t0, 18(a0)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    lbu s11, 17(a0)
+; RV32I-NEXT:    lbu a7, 18(a0)
 ; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu t6, 22(a0)
-; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    lbu s8, 22(a0)
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    li a3, 64
 ; RV32I-NEXT:    bltu s4, t4, .LBB15_49
 ; RV32I-NEXT:  # %bb.48:
 ; RV32I-NEXT:    li s4, 0
@@ -6008,45 +5978,41 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:  .LBB15_49:
 ; RV32I-NEXT:    srl s4, t2, ra
 ; RV32I-NEXT:  .LBB15_50:
-; RV32I-NEXT:    or s11, s8, t0
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    or s10, t6, a7
+; RV32I-NEXT:    lbu a7, 16(a0)
+; RV32I-NEXT:    lbu t6, 20(a0)
 ; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    or t6, a1, t6
-; RV32I-NEXT:    bgeu ra, a3, .LBB15_52
+; RV32I-NEXT:    or s8, a1, s8
+; RV32I-NEXT:    bgeu ra, s0, .LBB15_52
 ; RV32I-NEXT:  # %bb.51:
 ; RV32I-NEXT:    or s3, t5, s1
 ; RV32I-NEXT:    lw a1, 32(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    or s4, a1, s2
 ; RV32I-NEXT:  .LBB15_52:
-; RV32I-NEXT:    or a1, s10, t0
-; RV32I-NEXT:    slli s11, s11, 16
-; RV32I-NEXT:    or t0, s9, s8
-; RV32I-NEXT:    slli t6, t6, 16
+; RV32I-NEXT:    or a1, s11, a7
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    or a7, s9, t6
+; RV32I-NEXT:    slli s8, s8, 16
 ; RV32I-NEXT:    mv t5, t3
-; RV32I-NEXT:    mv s1, a5
-; RV32I-NEXT:    mv a3, a6
+; RV32I-NEXT:    mv t6, a5
 ; RV32I-NEXT:    beqz ra, .LBB15_54
 ; RV32I-NEXT:  # %bb.53:
 ; RV32I-NEXT:    mv t5, s3
-; RV32I-NEXT:    mv s1, s4
+; RV32I-NEXT:    mv t6, s4
 ; RV32I-NEXT:  .LBB15_54:
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    or s2, s11, a1
-; RV32I-NEXT:    or s1, t6, t0
+; RV32I-NEXT:    or s2, s10, a1
+; RV32I-NEXT:    or s1, s8, a7
 ; RV32I-NEXT:    li a1, 64
-; RV32I-NEXT:    mv a6, a7
-; RV32I-NEXT:    mv a7, s0
 ; RV32I-NEXT:    bltu ra, a1, .LBB15_56
 ; RV32I-NEXT:  # %bb.55:
 ; RV32I-NEXT:    sw zero, 40(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw zero, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:  .LBB15_56:
-; RV32I-NEXT:    srl s3, s2, a7
-; RV32I-NEXT:    sll ra, s1, a4
-; RV32I-NEXT:    mv a7, s5
+; RV32I-NEXT:    srl s3, s2, s5
+; RV32I-NEXT:    sll s0, s1, a4
 ; RV32I-NEXT:    sw t5, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw t6, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu a4, t4, .LBB15_58
 ; RV32I-NEXT:  # %bb.57:
 ; RV32I-NEXT:    sw zero, 32(sp) # 4-byte Folded Spill
@@ -6055,54 +6021,54 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:  .LBB15_58:
 ; RV32I-NEXT:    sll a1, s2, a4
 ; RV32I-NEXT:    sw a1, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    or a1, s3, ra
+; RV32I-NEXT:    or a1, s3, s0
 ; RV32I-NEXT:  .LBB15_59:
-; RV32I-NEXT:    lbu s9, 27(a0)
+; RV32I-NEXT:    lbu s11, 27(a0)
 ; RV32I-NEXT:    lbu t6, 31(a0)
 ; RV32I-NEXT:    mv t5, s1
 ; RV32I-NEXT:    beqz a4, .LBB15_61
 ; RV32I-NEXT:  # %bb.60:
 ; RV32I-NEXT:    mv t5, a1
 ; RV32I-NEXT:  .LBB15_61:
-; RV32I-NEXT:    lbu s8, 25(a0)
-; RV32I-NEXT:    lbu s4, 26(a0)
-; RV32I-NEXT:    lbu s11, 29(a0)
-; RV32I-NEXT:    lbu s10, 30(a0)
-; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    lbu s9, 25(a0)
+; RV32I-NEXT:    lbu s8, 26(a0)
+; RV32I-NEXT:    lbu s10, 29(a0)
+; RV32I-NEXT:    lbu s4, 30(a0)
 ; RV32I-NEXT:    slli t6, t6, 8
 ; RV32I-NEXT:    bltu s6, t4, .LBB15_63
 ; RV32I-NEXT:  # %bb.62:
-; RV32I-NEXT:    srl t0, s1, s6
+; RV32I-NEXT:    srl a7, s1, s6
 ; RV32I-NEXT:    j .LBB15_64
 ; RV32I-NEXT:  .LBB15_63:
 ; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sll a1, s1, a1
-; RV32I-NEXT:    or t0, s3, a1
+; RV32I-NEXT:    or a7, s3, a1
 ; RV32I-NEXT:  .LBB15_64:
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    lbu s3, 24(a0)
-; RV32I-NEXT:    lbu a1, 28(a0)
-; RV32I-NEXT:    or s4, s9, s4
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    or t6, t6, s10
+; RV32I-NEXT:    slli s3, s9, 8
+; RV32I-NEXT:    or a1, s11, s8
+; RV32I-NEXT:    lbu s11, 24(a0)
+; RV32I-NEXT:    lbu s8, 28(a0)
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    or t6, t6, s4
 ; RV32I-NEXT:    mv s9, s2
 ; RV32I-NEXT:    beqz s6, .LBB15_66
 ; RV32I-NEXT:  # %bb.65:
-; RV32I-NEXT:    mv s9, t0
+; RV32I-NEXT:    mv s9, a7
 ; RV32I-NEXT:  .LBB15_66:
-; RV32I-NEXT:    or a0, s8, s3
-; RV32I-NEXT:    slli t0, s4, 16
-; RV32I-NEXT:    or a1, s11, a1
+; RV32I-NEXT:    or a0, s3, s11
+; RV32I-NEXT:    slli a7, a1, 16
+; RV32I-NEXT:    or a1, s10, s8
 ; RV32I-NEXT:    slli t6, t6, 16
 ; RV32I-NEXT:    bltu s6, t4, .LBB15_68
 ; RV32I-NEXT:  # %bb.67:
 ; RV32I-NEXT:    li s4, 0
 ; RV32I-NEXT:    j .LBB15_69
 ; RV32I-NEXT:  .LBB15_68:
-; RV32I-NEXT:    srl s4, s1, s0
+; RV32I-NEXT:    srl s4, s1, s5
 ; RV32I-NEXT:  .LBB15_69:
 ; RV32I-NEXT:    li s11, 64
-; RV32I-NEXT:    or s6, t0, a0
+; RV32I-NEXT:    or s6, a7, a0
 ; RV32I-NEXT:    or a0, t6, a1
 ; RV32I-NEXT:    bltu a4, t4, .LBB15_71
 ; RV32I-NEXT:  # %bb.70:
@@ -6113,9 +6079,9 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    j .LBB15_73
 ; RV32I-NEXT:  .LBB15_71:
 ; RV32I-NEXT:    sll s3, s6, a4
-; RV32I-NEXT:    srl a1, s6, s0
-; RV32I-NEXT:    sll t0, a0, a4
-; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    srl a1, s6, s5
+; RV32I-NEXT:    sll a7, a0, a4
+; RV32I-NEXT:    or a1, a1, a7
 ; RV32I-NEXT:    mv s10, a0
 ; RV32I-NEXT:    beqz a4, .LBB15_73
 ; RV32I-NEXT:  .LBB15_72:
@@ -6132,7 +6098,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sll s5, s2, a4
 ; RV32I-NEXT:    lw a1, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    srl a1, s2, a1
-; RV32I-NEXT:    or a1, a1, ra
+; RV32I-NEXT:    or a1, a1, s0
 ; RV32I-NEXT:    mv s0, s1
 ; RV32I-NEXT:    beqz s7, .LBB15_77
 ; RV32I-NEXT:  .LBB15_76:
@@ -6196,8 +6162,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:  .LBB15_93:
 ; RV32I-NEXT:    sll s10, t1, a4
 ; RV32I-NEXT:    srl a1, t1, s3
-; RV32I-NEXT:    lw t0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    lw a7, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a1, a1, a7
 ; RV32I-NEXT:    j .LBB15_96
 ; RV32I-NEXT:  .LBB15_94:
 ; RV32I-NEXT:    srl s4, a5, s3
@@ -6223,8 +6189,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sll t4, t3, s9
 ; RV32I-NEXT:    neg a1, s11
 ; RV32I-NEXT:    srl a1, t3, a1
-; RV32I-NEXT:    sll t0, a5, s9
-; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    sll a7, a5, s9
+; RV32I-NEXT:    or a1, a1, a7
 ; RV32I-NEXT:    beqz s11, .LBB15_102
 ; RV32I-NEXT:  .LBB15_101:
 ; RV32I-NEXT:    mv a5, a1
@@ -6249,7 +6215,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:  # %bb.107:
 ; RV32I-NEXT:    li ra, 0
 ; RV32I-NEXT:    li a3, 0
-; RV32I-NEXT:    li a7, 0
+; RV32I-NEXT:    li t0, 0
 ; RV32I-NEXT:    li a6, 0
 ; RV32I-NEXT:    bnez a4, .LBB15_109
 ; RV32I-NEXT:    j .LBB15_110
@@ -6276,8 +6242,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli t1, ra, 24
 ; RV32I-NEXT:    srli a5, a3, 16
 ; RV32I-NEXT:    srli t4, a3, 24
-; RV32I-NEXT:    srli t0, a7, 16
-; RV32I-NEXT:    srli s0, a7, 24
+; RV32I-NEXT:    srli a7, t0, 16
+; RV32I-NEXT:    srli s0, t0, 24
 ; RV32I-NEXT:    srli t3, a6, 16
 ; RV32I-NEXT:    srli s3, a6, 24
 ; RV32I-NEXT:    srli t6, s2, 16
@@ -6296,7 +6262,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb s10, 1(a2)
 ; RV32I-NEXT:    sb a4, 2(a2)
 ; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    and a4, a7, t2
+; RV32I-NEXT:    and a4, t0, t2
 ; RV32I-NEXT:    srli t1, s11, 8
 ; RV32I-NEXT:    sb a3, 4(a2)
 ; RV32I-NEXT:    sb t1, 5(a2)
@@ -6304,9 +6270,9 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb t4, 7(a2)
 ; RV32I-NEXT:    and a3, a6, t2
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a7, 8(a2)
+; RV32I-NEXT:    sb t0, 8(a2)
 ; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    sb t0, 10(a2)
+; RV32I-NEXT:    sb a7, 10(a2)
 ; RV32I-NEXT:    sb s0, 11(a2)
 ; RV32I-NEXT:    and a4, s2, t2
 ; RV32I-NEXT:    srli a3, a3, 8
@@ -6698,54 +6664,54 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    slli t3, t3, 8
 ; RV32I-NEXT:    or a5, a7, a5
 ; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    lbu t0, 0(a1)
-; RV32I-NEXT:    lbu t1, 1(a1)
-; RV32I-NEXT:    or t2, t3, t2
+; RV32I-NEXT:    or t0, t3, t2
+; RV32I-NEXT:    lbu t1, 0(a1)
+; RV32I-NEXT:    lbu t2, 1(a1)
 ; RV32I-NEXT:    lbu t3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t1, t2, t1
 ; RV32I-NEXT:    li s9, 64
 ; RV32I-NEXT:    slli a1, a1, 8
 ; RV32I-NEXT:    or a1, a1, t3
 ; RV32I-NEXT:    li t4, 32
 ; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a1, a1, 16
 ; RV32I-NEXT:    or t3, a5, a4
-; RV32I-NEXT:    or a5, t2, a7
-; RV32I-NEXT:    or a4, a1, t0
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a4, a1, t1
 ; RV32I-NEXT:    slli a4, a4, 5
-; RV32I-NEXT:    neg s10, a4
-; RV32I-NEXT:    srl t5, t3, s10
-; RV32I-NEXT:    sll s5, a5, a4
+; RV32I-NEXT:    neg s5, a4
+; RV32I-NEXT:    srl t5, t3, s5
+; RV32I-NEXT:    sll s10, a5, a4
 ; RV32I-NEXT:    bltu a4, t4, .LBB16_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    li s8, 0
-; RV32I-NEXT:    sll a7, t3, a4
+; RV32I-NEXT:    sll t0, t3, a4
 ; RV32I-NEXT:    j .LBB16_3
 ; RV32I-NEXT:  .LBB16_2:
 ; RV32I-NEXT:    sll s8, t3, a4
-; RV32I-NEXT:    or a7, t5, s5
+; RV32I-NEXT:    or t0, t5, s10
 ; RV32I-NEXT:  .LBB16_3:
+; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    lbu t2, 9(a0)
-; RV32I-NEXT:    lbu a1, 10(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
 ; RV32I-NEXT:    lbu t1, 13(a0)
-; RV32I-NEXT:    lbu t0, 14(a0)
-; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    lbu a1, 14(a0)
 ; RV32I-NEXT:    slli t6, a3, 8
 ; RV32I-NEXT:    sub s6, s9, a4
 ; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    beqz a4, .LBB16_5
 ; RV32I-NEXT:  # %bb.4:
-; RV32I-NEXT:    mv a3, a7
+; RV32I-NEXT:    mv a3, t0
 ; RV32I-NEXT:  .LBB16_5:
-; RV32I-NEXT:    slli a7, t2, 8
-; RV32I-NEXT:    or a6, a6, a1
+; RV32I-NEXT:    slli t0, t2, 8
+; RV32I-NEXT:    or a6, a6, a7
 ; RV32I-NEXT:    lbu t2, 8(a0)
-; RV32I-NEXT:    lbu a1, 12(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
 ; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t6, t0
+; RV32I-NEXT:    or a1, t6, a1
 ; RV32I-NEXT:    neg t6, s6
 ; RV32I-NEXT:    sw t6, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu s6, t4, .LBB16_7
@@ -6756,25 +6722,25 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    sll t6, a5, t6
 ; RV32I-NEXT:    or t6, t5, t6
 ; RV32I-NEXT:  .LBB16_8:
-; RV32I-NEXT:    or a7, a7, t2
+; RV32I-NEXT:    or t0, t0, t2
 ; RV32I-NEXT:    slli t2, a6, 16
-; RV32I-NEXT:    or a1, t1, a1
-; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    or a7, t1, a7
+; RV32I-NEXT:    slli a1, a1, 16
 ; RV32I-NEXT:    mv a6, t3
 ; RV32I-NEXT:    beqz s6, .LBB16_10
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    mv a6, t6
 ; RV32I-NEXT:  .LBB16_10:
-; RV32I-NEXT:    or t1, t2, a7
-; RV32I-NEXT:    or t2, t0, a1
+; RV32I-NEXT:    or t1, t2, t0
+; RV32I-NEXT:    or t2, a1, a7
 ; RV32I-NEXT:    bltu s6, t4, .LBB16_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    li a7, 0
+; RV32I-NEXT:    li t0, 0
 ; RV32I-NEXT:    j .LBB16_13
 ; RV32I-NEXT:  .LBB16_12:
-; RV32I-NEXT:    srl a7, a5, s10
+; RV32I-NEXT:    srl t0, a5, s5
 ; RV32I-NEXT:  .LBB16_13:
-; RV32I-NEXT:    srl s0, t1, s10
+; RV32I-NEXT:    srl s0, t1, s5
 ; RV32I-NEXT:    sll a1, t2, a4
 ; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu a4, t4, .LBB16_15
@@ -6804,7 +6770,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:  .LBB16_20:
 ; RV32I-NEXT:    sll s2, t3, a4
 ; RV32I-NEXT:    srl a1, t3, a1
-; RV32I-NEXT:    or a1, a1, s5
+; RV32I-NEXT:    or a1, a1, s10
 ; RV32I-NEXT:    mv s4, a5
 ; RV32I-NEXT:    beqz s7, .LBB16_22
 ; RV32I-NEXT:  .LBB16_21:
@@ -6819,7 +6785,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:  .LBB16_24:
 ; RV32I-NEXT:    sw s8, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    or s2, a6, s1
-; RV32I-NEXT:    or s4, a7, s3
+; RV32I-NEXT:    or s4, t0, s3
 ; RV32I-NEXT:  .LBB16_25:
 ; RV32I-NEXT:    sub ra, a1, a4
 ; RV32I-NEXT:    mv a7, t1
@@ -6834,15 +6800,15 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    bltu ra, t4, .LBB16_29
 ; RV32I-NEXT:  # %bb.28:
 ; RV32I-NEXT:    srl a1, t2, ra
-; RV32I-NEXT:    sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv t0, t1
 ; RV32I-NEXT:    bnez ra, .LBB16_30
 ; RV32I-NEXT:    j .LBB16_31
 ; RV32I-NEXT:  .LBB16_29:
 ; RV32I-NEXT:    or a1, s0, s2
-; RV32I-NEXT:    sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv t0, t1
 ; RV32I-NEXT:    beqz ra, .LBB16_31
 ; RV32I-NEXT:  .LBB16_30:
-; RV32I-NEXT:    sw a1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv t0, a1
 ; RV32I-NEXT:  .LBB16_31:
 ; RV32I-NEXT:    bltu ra, t4, .LBB16_33
 ; RV32I-NEXT:  # %bb.32:
@@ -6852,7 +6818,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    bnez ra, .LBB16_34
 ; RV32I-NEXT:    j .LBB16_35
 ; RV32I-NEXT:  .LBB16_33:
-; RV32I-NEXT:    srl a1, t2, s10
+; RV32I-NEXT:    srl a1, t2, s5
 ; RV32I-NEXT:    sw a1, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sll a1, a5, s1
 ; RV32I-NEXT:    or a1, t5, a1
@@ -6873,7 +6839,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    or a1, a1, s2
 ; RV32I-NEXT:    j .LBB16_40
 ; RV32I-NEXT:  .LBB16_38:
-; RV32I-NEXT:    srl a1, a5, s10
+; RV32I-NEXT:    srl a1, a5, s5
 ; RV32I-NEXT:    sw a1, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu s3, t4, .LBB16_37
 ; RV32I-NEXT:  .LBB16_39:
@@ -6886,35 +6852,33 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:  # %bb.41:
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:  .LBB16_42:
-; RV32I-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s5, a7
+; RV32I-NEXT:    sw t0, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv t0, a7
 ; RV32I-NEXT:    bltu s4, t4, .LBB16_44
 ; RV32I-NEXT:  # %bb.43:
-; RV32I-NEXT:    srl t0, t2, s4
+; RV32I-NEXT:    srl a7, t2, s4
 ; RV32I-NEXT:    j .LBB16_45
 ; RV32I-NEXT:  .LBB16_44:
 ; RV32I-NEXT:    srl a1, t1, ra
-; RV32I-NEXT:    neg t0, s4
-; RV32I-NEXT:    sll t0, t2, t0
-; RV32I-NEXT:    or t0, a1, t0
+; RV32I-NEXT:    neg a7, s4
+; RV32I-NEXT:    sll a7, t2, a7
+; RV32I-NEXT:    or a7, a1, a7
 ; RV32I-NEXT:  .LBB16_45:
-; RV32I-NEXT:    mv s0, s10
-; RV32I-NEXT:    mv a7, a6
-; RV32I-NEXT:    lbu s8, 19(a0)
+; RV32I-NEXT:    sw s10, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    li s0, 64
+; RV32I-NEXT:    lbu t6, 19(a0)
 ; RV32I-NEXT:    lbu a1, 23(a0)
 ; RV32I-NEXT:    mv s3, t1
 ; RV32I-NEXT:    beqz s4, .LBB16_47
 ; RV32I-NEXT:  # %bb.46:
-; RV32I-NEXT:    mv s3, t0
+; RV32I-NEXT:    mv s3, a7
 ; RV32I-NEXT:  .LBB16_47:
-; RV32I-NEXT:    mv a6, a3
-; RV32I-NEXT:    lbu s10, 17(a0)
-; RV32I-NEXT:    lbu t0, 18(a0)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    lbu s11, 17(a0)
+; RV32I-NEXT:    lbu a7, 18(a0)
 ; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu t6, 22(a0)
-; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    lbu s8, 22(a0)
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    li a3, 64
 ; RV32I-NEXT:    bltu s4, t4, .LBB16_49
 ; RV32I-NEXT:  # %bb.48:
 ; RV32I-NEXT:    li s4, 0
@@ -6922,45 +6886,41 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:  .LBB16_49:
 ; RV32I-NEXT:    srl s4, t2, ra
 ; RV32I-NEXT:  .LBB16_50:
-; RV32I-NEXT:    or s11, s8, t0
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    or s10, t6, a7
+; RV32I-NEXT:    lbu a7, 16(a0)
+; RV32I-NEXT:    lbu t6, 20(a0)
 ; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    or t6, a1, t6
-; RV32I-NEXT:    bgeu ra, a3, .LBB16_52
+; RV32I-NEXT:    or s8, a1, s8
+; RV32I-NEXT:    bgeu ra, s0, .LBB16_52
 ; RV32I-NEXT:  # %bb.51:
 ; RV32I-NEXT:    or s3, t5, s1
 ; RV32I-NEXT:    lw a1, 32(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    or s4, a1, s2
 ; RV32I-NEXT:  .LBB16_52:
-; RV32I-NEXT:    or a1, s10, t0
-; RV32I-NEXT:    slli s11, s11, 16
-; RV32I-NEXT:    or t0, s9, s8
-; RV32I-NEXT:    slli t6, t6, 16
+; RV32I-NEXT:    or a1, s11, a7
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    or a7, s9, t6
+; RV32I-NEXT:    slli s8, s8, 16
 ; RV32I-NEXT:    mv t5, t3
-; RV32I-NEXT:    mv s1, a5
-; RV32I-NEXT:    mv a3, a6
+; RV32I-NEXT:    mv t6, a5
 ; RV32I-NEXT:    beqz ra, .LBB16_54
 ; RV32I-NEXT:  # %bb.53:
 ; RV32I-NEXT:    mv t5, s3
-; RV32I-NEXT:    mv s1, s4
+; RV32I-NEXT:    mv t6, s4
 ; RV32I-NEXT:  .LBB16_54:
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    or s2, s11, a1
-; RV32I-NEXT:    or s1, t6, t0
+; RV32I-NEXT:    or s2, s10, a1
+; RV32I-NEXT:    or s1, s8, a7
 ; RV32I-NEXT:    li a1, 64
-; RV32I-NEXT:    mv a6, a7
-; RV32I-NEXT:    mv a7, s0
 ; RV32I-NEXT:    bltu ra, a1, .LBB16_56
 ; RV32I-NEXT:  # %bb.55:
 ; RV32I-NEXT:    sw zero, 40(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw zero, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:  .LBB16_56:
-; RV32I-NEXT:    srl s3, s2, a7
-; RV32I-NEXT:    sll ra, s1, a4
-; RV32I-NEXT:    mv a7, s5
+; RV32I-NEXT:    srl s3, s2, s5
+; RV32I-NEXT:    sll s0, s1, a4
 ; RV32I-NEXT:    sw t5, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw t6, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu a4, t4, .LBB16_58
 ; RV32I-NEXT:  # %bb.57:
 ; RV32I-NEXT:    sw zero, 32(sp) # 4-byte Folded Spill
@@ -6969,54 +6929,54 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:  .LBB16_58:
 ; RV32I-NEXT:    sll a1, s2, a4
 ; RV32I-NEXT:    sw a1, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    or a1, s3, ra
+; RV32I-NEXT:    or a1, s3, s0
 ; RV32I-NEXT:  .LBB16_59:
-; RV32I-NEXT:    lbu s9, 27(a0)
+; RV32I-NEXT:    lbu s11, 27(a0)
 ; RV32I-NEXT:    lbu t6, 31(a0)
 ; RV32I-NEXT:    mv t5, s1
 ; RV32I-NEXT:    beqz a4, .LBB16_61
 ; RV32I-NEXT:  # %bb.60:
 ; RV32I-NEXT:    mv t5, a1
 ; RV32I-NEXT:  .LBB16_61:
-; RV32I-NEXT:    lbu s8, 25(a0)
-; RV32I-NEXT:    lbu s4, 26(a0)
-; RV32I-NEXT:    lbu s11, 29(a0)
-; RV32I-NEXT:    lbu s10, 30(a0)
-; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    lbu s9, 25(a0)
+; RV32I-NEXT:    lbu s8, 26(a0)
+; RV32I-NEXT:    lbu s10, 29(a0)
+; RV32I-NEXT:    lbu s4, 30(a0)
 ; RV32I-NEXT:    slli t6, t6, 8
 ; RV32I-NEXT:    bltu s6, t4, .LBB16_63
 ; RV32I-NEXT:  # %bb.62:
-; RV32I-NEXT:    srl t0, s1, s6
+; RV32I-NEXT:    srl a7, s1, s6
 ; RV32I-NEXT:    j .LBB16_64
 ; RV32I-NEXT:  .LBB16_63:
 ; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sll a1, s1, a1
-; RV32I-NEXT:    or t0, s3, a1
+; RV32I-NEXT:    or a7, s3, a1
 ; RV32I-NEXT:  .LBB16_64:
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    lbu s3, 24(a0)
-; RV32I-NEXT:    lbu a1, 28(a0)
-; RV32I-NEXT:    or s4, s9, s4
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    or t6, t6, s10
+; RV32I-NEXT:    slli s3, s9, 8
+; RV32I-NEXT:    or a1, s11, s8
+; RV32I-NEXT:    lbu s11, 24(a0)
+; RV32I-NEXT:    lbu s8, 28(a0)
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    or t6, t6, s4
 ; RV32I-NEXT:    mv s9, s2
 ; RV32I-NEXT:    beqz s6, .LBB16_66
 ; RV32I-NEXT:  # %bb.65:
-; RV32I-NEXT:    mv s9, t0
+; RV32I-NEXT:    mv s9, a7
 ; RV32I-NEXT:  .LBB16_66:
-; RV32I-NEXT:    or a0, s8, s3
-; RV32I-NEXT:    slli t0, s4, 16
-; RV32I-NEXT:    or a1, s11, a1
+; RV32I-NEXT:    or a0, s3, s11
+; RV32I-NEXT:    slli a7, a1, 16
+; RV32I-NEXT:    or a1, s10, s8
 ; RV32I-NEXT:    slli t6, t6, 16
 ; RV32I-NEXT:    bltu s6, t4, .LBB16_68
 ; RV32I-NEXT:  # %bb.67:
 ; RV32I-NEXT:    li s4, 0
 ; RV32I-NEXT:    j .LBB16_69
 ; RV32I-NEXT:  .LBB16_68:
-; RV32I-NEXT:    srl s4, s1, s0
+; RV32I-NEXT:    srl s4, s1, s5
 ; RV32I-NEXT:  .LBB16_69:
 ; RV32I-NEXT:    li s11, 64
-; RV32I-NEXT:    or s6, t0, a0
+; RV32I-NEXT:    or s6, a7, a0
 ; RV32I-NEXT:    or a0, t6, a1
 ; RV32I-NEXT:    bltu a4, t4, .LBB16_71
 ; RV32I-NEXT:  # %bb.70:
@@ -7027,9 +6987,9 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    j .LBB16_73
 ; RV32I-NEXT:  .LBB16_71:
 ; RV32I-NEXT:    sll s3, s6, a4
-; RV32I-NEXT:    srl a1, s6, s0
-; RV32I-NEXT:    sll t0, a0, a4
-; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    srl a1, s6, s5
+; RV32I-NEXT:    sll a7, a0, a4
+; RV32I-NEXT:    or a1, a1, a7
 ; RV32I-NEXT:    mv s10, a0
 ; RV32I-NEXT:    beqz a4, .LBB16_73
 ; RV32I-NEXT:  .LBB16_72:
@@ -7046,7 +7006,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    sll s5, s2, a4
 ; RV32I-NEXT:    lw a1, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    srl a1, s2, a1
-; RV32I-NEXT:    or a1, a1, ra
+; RV32I-NEXT:    or a1, a1, s0
 ; RV32I-NEXT:    mv s0, s1
 ; RV32I-NEXT:    beqz s7, .LBB16_77
 ; RV32I-NEXT:  .LBB16_76:
@@ -7110,8 +7070,8 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:  .LBB16_93:
 ; RV32I-NEXT:    sll s10, t1, a4
 ; RV32I-NEXT:    srl a1, t1, s3
-; RV32I-NEXT:    lw t0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    lw a7, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a1, a1, a7
 ; RV32I-NEXT:    j .LBB16_96
 ; RV32I-NEXT:  .LBB16_94:
 ; RV32I-NEXT:    srl s4, a5, s3
@@ -7137,8 +7097,8 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    sll t4, t3, s9
 ; RV32I-NEXT:    neg a1, s11
 ; RV32I-NEXT:    srl a1, t3, a1
-; RV32I-NEXT:    sll t0, a5, s9
-; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    sll a7, a5, s9
+; RV32I-NEXT:    or a1, a1, a7
 ; RV32I-NEXT:    beqz s11, .LBB16_102
 ; RV32I-NEXT:  .LBB16_101:
 ; RV32I-NEXT:    mv a5, a1
@@ -7163,7 +7123,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:  # %bb.107:
 ; RV32I-NEXT:    li ra, 0
 ; RV32I-NEXT:    li a3, 0
-; RV32I-NEXT:    li a7, 0
+; RV32I-NEXT:    li t0, 0
 ; RV32I-NEXT:    li a6, 0
 ; RV32I-NEXT:    bnez a4, .LBB16_109
 ; RV32I-NEXT:    j .LBB16_110
@@ -7190,8 +7150,8 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    srli t1, ra, 24
 ; RV32I-NEXT:    srli a5, a3, 16
 ; RV32I-NEXT:    srli t4, a3, 24
-; RV32I-NEXT:    srli t0, a7, 16
-; RV32I-NEXT:    srli s0, a7, 24
+; RV32I-NEXT:    srli a7, t0, 16
+; RV32I-NEXT:    srli s0, t0, 24
 ; RV32I-NEXT:    srli t3, a6, 16
 ; RV32I-NEXT:    srli s3, a6, 24
 ; RV32I-NEXT:    srli t6, s2, 16
@@ -7210,7 +7170,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    sb s10, 1(a2)
 ; RV32I-NEXT:    sb a4, 2(a2)
 ; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    and a4, a7, t2
+; RV32I-NEXT:    and a4, t0, t2
 ; RV32I-NEXT:    srli t1, s11, 8
 ; RV32I-NEXT:    sb a3, 4(a2)
 ; RV32I-NEXT:    sb t1, 5(a2)
@@ -7218,9 +7178,9 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    sb t4, 7(a2)
 ; RV32I-NEXT:    and a3, a6, t2
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a7, 8(a2)
+; RV32I-NEXT:    sb t0, 8(a2)
 ; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    sb t0, 10(a2)
+; RV32I-NEXT:    sb a7, 10(a2)
 ; RV32I-NEXT:    sb s0, 11(a2)
 ; RV32I-NEXT:    and a4, s2, t2
 ; RV32I-NEXT:    srli a3, a3, 8
@@ -7612,54 +7572,54 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    slli t3, t3, 8
 ; RV32I-NEXT:    or a5, a7, a5
 ; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    lbu t0, 0(a1)
-; RV32I-NEXT:    lbu t1, 1(a1)
-; RV32I-NEXT:    or t2, t3, t2
+; RV32I-NEXT:    or t0, t3, t2
+; RV32I-NEXT:    lbu t1, 0(a1)
+; RV32I-NEXT:    lbu t2, 1(a1)
 ; RV32I-NEXT:    lbu t3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t1, t2, t1
 ; RV32I-NEXT:    li s9, 64
 ; RV32I-NEXT:    slli a1, a1, 8
 ; RV32I-NEXT:    or a1, a1, t3
 ; RV32I-NEXT:    li t4, 32
 ; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a1, a1, 16
 ; RV32I-NEXT:    or t3, a5, a4
-; RV32I-NEXT:    or a5, t2, a7
-; RV32I-NEXT:    or a4, a1, t0
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a4, a1, t1
 ; RV32I-NEXT:    slli a4, a4, 6
-; RV32I-NEXT:    neg s10, a4
-; RV32I-NEXT:    srl t5, t3, s10
-; RV32I-NEXT:    sll s5, a5, a4
+; RV32I-NEXT:    neg s5, a4
+; RV32I-NEXT:    srl t5, t3, s5
+; RV32I-NEXT:    sll s10, a5, a4
 ; RV32I-NEXT:    bltu a4, t4, .LBB17_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    li s8, 0
-; RV32I-NEXT:    sll a7, t3, a4
+; RV32I-NEXT:    sll t0, t3, a4
 ; RV32I-NEXT:    j .LBB17_3
 ; RV32I-NEXT:  .LBB17_2:
 ; RV32I-NEXT:    sll s8, t3, a4
-; RV32I-NEXT:    or a7, t5, s5
+; RV32I-NEXT:    or t0, t5, s10
 ; RV32I-NEXT:  .LBB17_3:
+; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    lbu t2, 9(a0)
-; RV32I-NEXT:    lbu a1, 10(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
 ; RV32I-NEXT:    lbu t1, 13(a0)
-; RV32I-NEXT:    lbu t0, 14(a0)
-; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    lbu a1, 14(a0)
 ; RV32I-NEXT:    slli t6, a3, 8
 ; RV32I-NEXT:    sub s6, s9, a4
 ; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    beqz a4, .LBB17_5
 ; RV32I-NEXT:  # %bb.4:
-; RV32I-NEXT:    mv a3, a7
+; RV32I-NEXT:    mv a3, t0
 ; RV32I-NEXT:  .LBB17_5:
-; RV32I-NEXT:    slli a7, t2, 8
-; RV32I-NEXT:    or a6, a6, a1
+; RV32I-NEXT:    slli t0, t2, 8
+; RV32I-NEXT:    or a6, a6, a7
 ; RV32I-NEXT:    lbu t2, 8(a0)
-; RV32I-NEXT:    lbu a1, 12(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
 ; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t6, t0
+; RV32I-NEXT:    or a1, t6, a1
 ; RV32I-NEXT:    neg t6, s6
 ; RV32I-NEXT:    sw t6, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu s6, t4, .LBB17_7
@@ -7670,25 +7630,25 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    sll t6, a5, t6
 ; RV32I-NEXT:    or t6, t5, t6
 ; RV32I-NEXT:  .LBB17_8:
-; RV32I-NEXT:    or a7, a7, t2
+; RV32I-NEXT:    or t0, t0, t2
 ; RV32I-NEXT:    slli t2, a6, 16
-; RV32I-NEXT:    or a1, t1, a1
-; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    or a7, t1, a7
+; RV32I-NEXT:    slli a1, a1, 16
 ; RV32I-NEXT:    mv a6, t3
 ; RV32I-NEXT:    beqz s6, .LBB17_10
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    mv a6, t6
 ; RV32I-NEXT:  .LBB17_10:
-; RV32I-NEXT:    or t1, t2, a7
-; RV32I-NEXT:    or t2, t0, a1
+; RV32I-NEXT:    or t1, t2, t0
+; RV32I-NEXT:    or t2, a1, a7
 ; RV32I-NEXT:    bltu s6, t4, .LBB17_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    li a7, 0
+; RV32I-NEXT:    li t0, 0
 ; RV32I-NEXT:    j .LBB17_13
 ; RV32I-NEXT:  .LBB17_12:
-; RV32I-NEXT:    srl a7, a5, s10
+; RV32I-NEXT:    srl t0, a5, s5
 ; RV32I-NEXT:  .LBB17_13:
-; RV32I-NEXT:    srl s0, t1, s10
+; RV32I-NEXT:    srl s0, t1, s5
 ; RV32I-NEXT:    sll a1, t2, a4
 ; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu a4, t4, .LBB17_15
@@ -7718,7 +7678,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:  .LBB17_20:
 ; RV32I-NEXT:    sll s2, t3, a4
 ; RV32I-NEXT:    srl a1, t3, a1
-; RV32I-NEXT:    or a1, a1, s5
+; RV32I-NEXT:    or a1, a1, s10
 ; RV32I-NEXT:    mv s4, a5
 ; RV32I-NEXT:    beqz s7, .LBB17_22
 ; RV32I-NEXT:  .LBB17_21:
@@ -7733,7 +7693,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:  .LBB17_24:
 ; RV32I-NEXT:    sw s8, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    or s2, a6, s1
-; RV32I-NEXT:    or s4, a7, s3
+; RV32I-NEXT:    or s4, t0, s3
 ; RV32I-NEXT:  .LBB17_25:
 ; RV32I-NEXT:    sub ra, a1, a4
 ; RV32I-NEXT:    mv a7, t1
@@ -7748,15 +7708,15 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    bltu ra, t4, .LBB17_29
 ; RV32I-NEXT:  # %bb.28:
 ; RV32I-NEXT:    srl a1, t2, ra
-; RV32I-NEXT:    sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv t0, t1
 ; RV32I-NEXT:    bnez ra, .LBB17_30
 ; RV32I-NEXT:    j .LBB17_31
 ; RV32I-NEXT:  .LBB17_29:
 ; RV32I-NEXT:    or a1, s0, s2
-; RV32I-NEXT:    sw t1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv t0, t1
 ; RV32I-NEXT:    beqz ra, .LBB17_31
 ; RV32I-NEXT:  .LBB17_30:
-; RV32I-NEXT:    sw a1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv t0, a1
 ; RV32I-NEXT:  .LBB17_31:
 ; RV32I-NEXT:    bltu ra, t4, .LBB17_33
 ; RV32I-NEXT:  # %bb.32:
@@ -7766,7 +7726,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    bnez ra, .LBB17_34
 ; RV32I-NEXT:    j .LBB17_35
 ; RV32I-NEXT:  .LBB17_33:
-; RV32I-NEXT:    srl a1, t2, s10
+; RV32I-NEXT:    srl a1, t2, s5
 ; RV32I-NEXT:    sw a1, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sll a1, a5, s1
 ; RV32I-NEXT:    or a1, t5, a1
@@ -7787,7 +7747,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    or a1, a1, s2
 ; RV32I-NEXT:    j .LBB17_40
 ; RV32I-NEXT:  .LBB17_38:
-; RV32I-NEXT:    srl a1, a5, s10
+; RV32I-NEXT:    srl a1, a5, s5
 ; RV32I-NEXT:    sw a1, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu s3, t4, .LBB17_37
 ; RV32I-NEXT:  .LBB17_39:
@@ -7800,35 +7760,33 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:  # %bb.41:
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:  .LBB17_42:
-; RV32I-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s5, a7
+; RV32I-NEXT:    sw t0, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv t0, a7
 ; RV32I-NEXT:    bltu s4, t4, .LBB17_44
 ; RV32I-NEXT:  # %bb.43:
-; RV32I-NEXT:    srl t0, t2, s4
+; RV32I-NEXT:    srl a7, t2, s4
 ; RV32I-NEXT:    j .LBB17_45
 ; RV32I-NEXT:  .LBB17_44:
 ; RV32I-NEXT:    srl a1, t1, ra
-; RV32I-NEXT:    neg t0, s4
-; RV32I-NEXT:    sll t0, t2, t0
-; RV32I-NEXT:    or t0, a1, t0
+; RV32I-NEXT:    neg a7, s4
+; RV32I-NEXT:    sll a7, t2, a7
+; RV32I-NEXT:    or a7, a1, a7
 ; RV32I-NEXT:  .LBB17_45:
-; RV32I-NEXT:    mv s0, s10
-; RV32I-NEXT:    mv a7, a6
-; RV32I-NEXT:    lbu s8, 19(a0)
+; RV32I-NEXT:    sw s10, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    li s0, 64
+; RV32I-NEXT:    lbu t6, 19(a0)
 ; RV32I-NEXT:    lbu a1, 23(a0)
 ; RV32I-NEXT:    mv s3, t1
 ; RV32I-NEXT:    beqz s4, .LBB17_47
 ; RV32I-NEXT:  # %bb.46:
-; RV32I-NEXT:    mv s3, t0
+; RV32I-NEXT:    mv s3, a7
 ; RV32I-NEXT:  .LBB17_47:
-; RV32I-NEXT:    mv a6, a3
-; RV32I-NEXT:    lbu s10, 17(a0)
-; RV32I-NEXT:    lbu t0, 18(a0)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    lbu s11, 17(a0)
+; RV32I-NEXT:    lbu a7, 18(a0)
 ; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu t6, 22(a0)
-; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    lbu s8, 22(a0)
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    li a3, 64
 ; RV32I-NEXT:    bltu s4, t4, .LBB17_49
 ; RV32I-NEXT:  # %bb.48:
 ; RV32I-NEXT:    li s4, 0
@@ -7836,45 +7794,41 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:  .LBB17_49:
 ; RV32I-NEXT:    srl s4, t2, ra
 ; RV32I-NEXT:  .LBB17_50:
-; RV32I-NEXT:    or s11, s8, t0
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    or s10, t6, a7
+; RV32I-NEXT:    lbu a7, 16(a0)
+; RV32I-NEXT:    lbu t6, 20(a0)
 ; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    or t6, a1, t6
-; RV32I-NEXT:    bgeu ra, a3, .LBB17_52
+; RV32I-NEXT:    or s8, a1, s8
+; RV32I-NEXT:    bgeu ra, s0, .LBB17_52
 ; RV32I-NEXT:  # %bb.51:
 ; RV32I-NEXT:    or s3, t5, s1
 ; RV32I-NEXT:    lw a1, 32(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    or s4, a1, s2
 ; RV32I-NEXT:  .LBB17_52:
-; RV32I-NEXT:    or a1, s10, t0
-; RV32I-NEXT:    slli s11, s11, 16
-; RV32I-NEXT:    or t0, s9, s8
-; RV32I-NEXT:    slli t6, t6, 16
+; RV32I-NEXT:    or a1, s11, a7
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    or a7, s9, t6
+; RV32I-NEXT:    slli s8, s8, 16
 ; RV32I-NEXT:    mv t5, t3
-; RV32I-NEXT:    mv s1, a5
-; RV32I-NEXT:    mv a3, a6
+; RV32I-NEXT:    mv t6, a5
 ; RV32I-NEXT:    beqz ra, .LBB17_54
 ; RV32I-NEXT:  # %bb.53:
 ; RV32I-NEXT:    mv t5, s3
-; RV32I-NEXT:    mv s1, s4
+; RV32I-NEXT:    mv t6, s4
 ; RV32I-NEXT:  .LBB17_54:
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    or s2, s11, a1
-; RV32I-NEXT:    or s1, t6, t0
+; RV32I-NEXT:    or s2, s10, a1
+; RV32I-NEXT:    or s1, s8, a7
 ; RV32I-NEXT:    li a1, 64
-; RV32I-NEXT:    mv a6, a7
-; RV32I-NEXT:    mv a7, s0
 ; RV32I-NEXT:    bltu ra, a1, .LBB17_56
 ; RV32I-NEXT:  # %bb.55:
 ; RV32I-NEXT:    sw zero, 40(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw zero, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:  .LBB17_56:
-; RV32I-NEXT:    srl s3, s2, a7
-; RV32I-NEXT:    sll ra, s1, a4
-; RV32I-NEXT:    mv a7, s5
+; RV32I-NEXT:    srl s3, s2, s5
+; RV32I-NEXT:    sll s0, s1, a4
 ; RV32I-NEXT:    sw t5, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw t6, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu a4, t4, .LBB17_58
 ; RV32I-NEXT:  # %bb.57:
 ; RV32I-NEXT:    sw zero, 32(sp) # 4-byte Folded Spill
@@ -7883,54 +7837,54 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:  .LBB17_58:
 ; RV32I-NEXT:    sll a1, s2, a4
 ; RV32I-NEXT:    sw a1, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    or a1, s3, ra
+; RV32I-NEXT:    or a1, s3, s0
 ; RV32I-NEXT:  .LBB17_59:
-; RV32I-NEXT:    lbu s9, 27(a0)
+; RV32I-NEXT:    lbu s11, 27(a0)
 ; RV32I-NEXT:    lbu t6, 31(a0)
 ; RV32I-NEXT:    mv t5, s1
 ; RV32I-NEXT:    beqz a4, .LBB17_61
 ; RV32I-NEXT:  # %bb.60:
 ; RV32I-NEXT:    mv t5, a1
 ; RV32I-NEXT:  .LBB17_61:
-; RV32I-NEXT:    lbu s8, 25(a0)
-; RV32I-NEXT:    lbu s4, 26(a0)
-; RV32I-NEXT:    lbu s11, 29(a0)
-; RV32I-NEXT:    lbu s10, 30(a0)
-; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    lbu s9, 25(a0)
+; RV32I-NEXT:    lbu s8, 26(a0)
+; RV32I-NEXT:    lbu s10, 29(a0)
+; RV32I-NEXT:    lbu s4, 30(a0)
 ; RV32I-NEXT:    slli t6, t6, 8
 ; RV32I-NEXT:    bltu s6, t4, .LBB17_63
 ; RV32I-NEXT:  # %bb.62:
-; RV32I-NEXT:    srl t0, s1, s6
+; RV32I-NEXT:    srl a7, s1, s6
 ; RV32I-NEXT:    j .LBB17_64
 ; RV32I-NEXT:  .LBB17_63:
 ; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sll a1, s1, a1
-; RV32I-NEXT:    or t0, s3, a1
+; RV32I-NEXT:    or a7, s3, a1
 ; RV32I-NEXT:  .LBB17_64:
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    lbu s3, 24(a0)
-; RV32I-NEXT:    lbu a1, 28(a0)
-; RV32I-NEXT:    or s4, s9, s4
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    or t6, t6, s10
+; RV32I-NEXT:    slli s3, s9, 8
+; RV32I-NEXT:    or a1, s11, s8
+; RV32I-NEXT:    lbu s11, 24(a0)
+; RV32I-NEXT:    lbu s8, 28(a0)
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    or t6, t6, s4
 ; RV32I-NEXT:    mv s9, s2
 ; RV32I-NEXT:    beqz s6, .LBB17_66
 ; RV32I-NEXT:  # %bb.65:
-; RV32I-NEXT:    mv s9, t0
+; RV32I-NEXT:    mv s9, a7
 ; RV32I-NEXT:  .LBB17_66:
-; RV32I-NEXT:    or a0, s8, s3
-; RV32I-NEXT:    slli t0, s4, 16
-; RV32I-NEXT:    or a1, s11, a1
+; RV32I-NEXT:    or a0, s3, s11
+; RV32I-NEXT:    slli a7, a1, 16
+; RV32I-NEXT:    or a1, s10, s8
 ; RV32I-NEXT:    slli t6, t6, 16
 ; RV32I-NEXT:    bltu s6, t4, .LBB17_68
 ; RV32I-NEXT:  # %bb.67:
 ; RV32I-NEXT:    li s4, 0
 ; RV32I-NEXT:    j .LBB17_69
 ; RV32I-NEXT:  .LBB17_68:
-; RV32I-NEXT:    srl s4, s1, s0
+; RV32I-NEXT:    srl s4, s1, s5
 ; RV32I-NEXT:  .LBB17_69:
 ; RV32I-NEXT:    li s11, 64
-; RV32I-NEXT:    or s6, t0, a0
+; RV32I-NEXT:    or s6, a7, a0
 ; RV32I-NEXT:    or a0, t6, a1
 ; RV32I-NEXT:    bltu a4, t4, .LBB17_71
 ; RV32I-NEXT:  # %bb.70:
@@ -7941,9 +7895,9 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    j .LBB17_73
 ; RV32I-NEXT:  .LBB17_71:
 ; RV32I-NEXT:    sll s3, s6, a4
-; RV32I-NEXT:    srl a1, s6, s0
-; RV32I-NEXT:    sll t0, a0, a4
-; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    srl a1, s6, s5
+; RV32I-NEXT:    sll a7, a0, a4
+; RV32I-NEXT:    or a1, a1, a7
 ; RV32I-NEXT:    mv s10, a0
 ; RV32I-NEXT:    beqz a4, .LBB17_73
 ; RV32I-NEXT:  .LBB17_72:
@@ -7960,7 +7914,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    sll s5, s2, a4
 ; RV32I-NEXT:    lw a1, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    srl a1, s2, a1
-; RV32I-NEXT:    or a1, a1, ra
+; RV32I-NEXT:    or a1, a1, s0
 ; RV32I-NEXT:    mv s0, s1
 ; RV32I-NEXT:    beqz s7, .LBB17_77
 ; RV32I-NEXT:  .LBB17_76:
@@ -8024,8 +7978,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:  .LBB17_93:
 ; RV32I-NEXT:    sll s10, t1, a4
 ; RV32I-NEXT:    srl a1, t1, s3
-; RV32I-NEXT:    lw t0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    lw a7, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a1, a1, a7
 ; RV32I-NEXT:    j .LBB17_96
 ; RV32I-NEXT:  .LBB17_94:
 ; RV32I-NEXT:    srl s4, a5, s3
@@ -8051,8 +8005,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    sll t4, t3, s9
 ; RV32I-NEXT:    neg a1, s11
 ; RV32I-NEXT:    srl a1, t3, a1
-; RV32I-NEXT:    sll t0, a5, s9
-; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    sll a7, a5, s9
+; RV32I-NEXT:    or a1, a1, a7
 ; RV32I-NEXT:    beqz s11, .LBB17_102
 ; RV32I-NEXT:  .LBB17_101:
 ; RV32I-NEXT:    mv a5, a1
@@ -8077,7 +8031,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:  # %bb.107:
 ; RV32I-NEXT:    li ra, 0
 ; RV32I-NEXT:    li a3, 0
-; RV32I-NEXT:    li a7, 0
+; RV32I-NEXT:    li t0, 0
 ; RV32I-NEXT:    li a6, 0
 ; RV32I-NEXT:    bnez a4, .LBB17_109
 ; RV32I-NEXT:    j .LBB17_110
@@ -8104,8 +8058,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    srli t1, ra, 24
 ; RV32I-NEXT:    srli a5, a3, 16
 ; RV32I-NEXT:    srli t4, a3, 24
-; RV32I-NEXT:    srli t0, a7, 16
-; RV32I-NEXT:    srli s0, a7, 24
+; RV32I-NEXT:    srli a7, t0, 16
+; RV32I-NEXT:    srli s0, t0, 24
 ; RV32I-NEXT:    srli t3, a6, 16
 ; RV32I-NEXT:    srli s3, a6, 24
 ; RV32I-NEXT:    srli t6, s2, 16
@@ -8124,7 +8078,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    sb s10, 1(a2)
 ; RV32I-NEXT:    sb a4, 2(a2)
 ; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    and a4, a7, t2
+; RV32I-NEXT:    and a4, t0, t2
 ; RV32I-NEXT:    srli t1, s11, 8
 ; RV32I-NEXT:    sb a3, 4(a2)
 ; RV32I-NEXT:    sb t1, 5(a2)
@@ -8132,9 +8086,9 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    sb t4, 7(a2)
 ; RV32I-NEXT:    and a3, a6, t2
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a7, 8(a2)
+; RV32I-NEXT:    sb t0, 8(a2)
 ; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    sb t0, 10(a2)
+; RV32I-NEXT:    sb a7, 10(a2)
 ; RV32I-NEXT:    sb s0, 11(a2)
 ; RV32I-NEXT:    and a4, s2, t2
 ; RV32I-NEXT:    srli a3, a3, 8
@@ -8227,88 +8181,88 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    or t0, t2, t1
 ; RV64I-NEXT:    lbu s8, 20(a0)
 ; RV64I-NEXT:    lbu s9, 21(a0)
 ; RV64I-NEXT:    lbu s10, 22(a0)
 ; RV64I-NEXT:    lbu s11, 23(a0)
-; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
 ; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    slli s3, s3, 8
 ; RV64I-NEXT:    or a4, t4, t3
 ; RV64I-NEXT:    or a6, t6, t5
-; RV64I-NEXT:    or t0, s1, s0
-; RV64I-NEXT:    lbu t5, 24(a0)
-; RV64I-NEXT:    lbu t6, 25(a0)
-; RV64I-NEXT:    lbu s0, 26(a0)
-; RV64I-NEXT:    lbu s1, 27(a0)
-; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t3, 24(a0)
+; RV64I-NEXT:    lbu t4, 25(a0)
+; RV64I-NEXT:    lbu t5, 26(a0)
+; RV64I-NEXT:    lbu t6, 27(a0)
 ; RV64I-NEXT:    slli s5, s5, 8
 ; RV64I-NEXT:    slli s7, s7, 8
-; RV64I-NEXT:    or t4, s3, s2
-; RV64I-NEXT:    or t2, s5, s4
-; RV64I-NEXT:    or t3, s7, s6
-; RV64I-NEXT:    lbu s2, 28(a0)
-; RV64I-NEXT:    lbu s3, 29(a0)
-; RV64I-NEXT:    lbu s4, 30(a0)
-; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or s0, s5, s4
+; RV64I-NEXT:    or s1, s7, s6
+; RV64I-NEXT:    or s2, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli s11, s11, 8
+; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s5, s9, s8
+; RV64I-NEXT:    slli s4, s4, 8
 ; RV64I-NEXT:    or s6, s11, s10
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or t4, t6, t5
+; RV64I-NEXT:    or t5, s4, s3
 ; RV64I-NEXT:    lbu t6, 0(a1)
-; RV64I-NEXT:    lbu s1, 1(a1)
-; RV64I-NEXT:    lbu s7, 2(a1)
-; RV64I-NEXT:    lbu s8, 3(a1)
-; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
 ; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    or s3, a0, s4
-; RV64I-NEXT:    or t6, s1, t6
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s7, s7, 8
+; RV64I-NEXT:    or s5, a0, s5
+; RV64I-NEXT:    or t6, s3, t6
+; RV64I-NEXT:    or s3, s7, s4
 ; RV64I-NEXT:    lbu a0, 4(a1)
-; RV64I-NEXT:    lbu s1, 5(a1)
-; RV64I-NEXT:    lbu s4, 6(a1)
+; RV64I-NEXT:    lbu s4, 5(a1)
+; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    or s7, s8, s7
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s1, s1, a0
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or s4, s4, a0
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or s4, a1, s4
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    or a1, t1, a7
-; RV64I-NEXT:    slli t4, t4, 16
-; RV64I-NEXT:    or a0, t4, t0
-; RV64I-NEXT:    slli t3, t3, 16
-; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    or s7, a1, s7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    or a1, t0, a7
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    or a0, t2, t1
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    or s0, s1, s0
 ; RV64I-NEXT:    slli s6, s6, 16
-; RV64I-NEXT:    or t1, s6, s5
-; RV64I-NEXT:    slli s0, s0, 16
-; RV64I-NEXT:    or t4, s0, t5
+; RV64I-NEXT:    or a7, s6, s2
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    or t1, t4, t3
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    or t4, s5, t5
 ; RV64I-NEXT:    slli s3, s3, 16
-; RV64I-NEXT:    or t5, s3, s2
+; RV64I-NEXT:    or t5, s3, t6
 ; RV64I-NEXT:    slli s7, s7, 16
-; RV64I-NEXT:    or t6, s7, t6
-; RV64I-NEXT:    slli s4, s4, 16
-; RV64I-NEXT:    or s0, s4, s1
+; RV64I-NEXT:    or t6, s7, s4
 ; RV64I-NEXT:    li t0, 64
 ; RV64I-NEXT:    slli t3, a5, 16
 ; RV64I-NEXT:    slli t2, a6, 16
-; RV64I-NEXT:    slli t1, t1, 32
-; RV64I-NEXT:    slli t5, t5, 32
-; RV64I-NEXT:    slli s0, s0, 32
-; RV64I-NEXT:    or a7, t1, a7
-; RV64I-NEXT:    or a5, t5, t4
-; RV64I-NEXT:    or a6, s0, t6
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli t4, t4, 32
+; RV64I-NEXT:    slli t6, t6, 32
+; RV64I-NEXT:    or a7, a7, s0
+; RV64I-NEXT:    or a5, t4, t1
+; RV64I-NEXT:    or a6, t6, t5
 ; RV64I-NEXT:    slli a6, a6, 3
 ; RV64I-NEXT:    subw t1, a6, t0
 ; RV64I-NEXT:    negw t5, a6
@@ -8522,47 +8476,47 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu t0, 21(a0)
 ; RV32I-NEXT:    lbu t1, 22(a0)
 ; RV32I-NEXT:    lbu t2, 23(a0)
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu t4, 25(a0)
-; RV32I-NEXT:    lbu t5, 26(a0)
-; RV32I-NEXT:    lbu t6, 27(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 24(a0)
+; RV32I-NEXT:    lbu t3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
 ; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or a5, a6, a5
 ; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a5, t2, t1
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a4, t3, a4
 ; RV32I-NEXT:    lbu a7, 28(a0)
-; RV32I-NEXT:    lbu t0, 29(a0)
-; RV32I-NEXT:    lbu t1, 30(a0)
-; RV32I-NEXT:    lbu t2, 31(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    or t4, t6, t5
-; RV32I-NEXT:    or t0, t0, a7
+; RV32I-NEXT:    lbu t1, 29(a0)
+; RV32I-NEXT:    lbu t2, 30(a0)
+; RV32I-NEXT:    lbu t3, 31(a0)
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t1, t1, a7
+; RV32I-NEXT:    or t2, t3, t2
 ; RV32I-NEXT:    lbu a7, 0(a1)
-; RV32I-NEXT:    lbu t5, 1(a1)
-; RV32I-NEXT:    lbu t6, 2(a1)
+; RV32I-NEXT:    lbu t3, 1(a1)
+; RV32I-NEXT:    lbu t5, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    or s0, t5, a7
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or t3, t3, a7
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or t2, a1, t6
+; RV32I-NEXT:    or t6, a1, t5
 ; RV32I-NEXT:    li t5, 32
-; RV32I-NEXT:    slli a7, a4, 16
-; RV32I-NEXT:    slli a1, a5, 16
+; RV32I-NEXT:    slli a7, a5, 16
+; RV32I-NEXT:    slli a1, t0, 16
 ; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli a5, t2, 16
-; RV32I-NEXT:    or t2, t4, t3
-; RV32I-NEXT:    or a4, t1, t0
-; RV32I-NEXT:    or a5, a5, s0
+; RV32I-NEXT:    slli t6, t6, 16
+; RV32I-NEXT:    or t2, t4, a4
+; RV32I-NEXT:    or a4, a5, t1
+; RV32I-NEXT:    or a5, t6, t3
 ; RV32I-NEXT:    slli a5, a5, 3
 ; RV32I-NEXT:    srl s0, t2, a5
 ; RV32I-NEXT:    neg s6, a5
@@ -8628,6 +8582,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:  .LBB18_18:
 ; RV32I-NEXT:    neg s11, s9
 ; RV32I-NEXT:    sw s0, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu s9, t5, .LBB18_20
 ; RV32I-NEXT:  # %bb.19:
 ; RV32I-NEXT:    sra s0, a4, s9
@@ -8636,20 +8591,19 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sll a3, a4, s11
 ; RV32I-NEXT:    or s0, s0, a3
 ; RV32I-NEXT:  .LBB18_21:
-; RV32I-NEXT:    sw s1, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw t4, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s3, 11(a0)
+; RV32I-NEXT:    lbu s1, 11(a0)
 ; RV32I-NEXT:    lbu a3, 15(a0)
 ; RV32I-NEXT:    mv t4, t2
 ; RV32I-NEXT:    beqz s9, .LBB18_23
 ; RV32I-NEXT:  # %bb.22:
 ; RV32I-NEXT:    mv t4, s0
 ; RV32I-NEXT:  .LBB18_23:
+; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    lbu s2, 9(a0)
-; RV32I-NEXT:    lbu s1, 10(a0)
+; RV32I-NEXT:    lbu s3, 10(a0)
 ; RV32I-NEXT:    lbu s8, 13(a0)
 ; RV32I-NEXT:    lbu ra, 14(a0)
-; RV32I-NEXT:    slli s3, s3, 8
 ; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    bltu s9, t5, .LBB18_25
 ; RV32I-NEXT:  # %bb.24:
@@ -8658,12 +8612,12 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:  .LBB18_25:
 ; RV32I-NEXT:    sra s0, a4, a5
 ; RV32I-NEXT:  .LBB18_26:
-; RV32I-NEXT:    or s1, s3, s1
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    or s1, s1, s3
 ; RV32I-NEXT:    lbu s5, 8(a0)
 ; RV32I-NEXT:    lbu s3, 12(a0)
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s4, s8, 8
-; RV32I-NEXT:    or s8, a3, ra
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or ra, a3, ra
 ; RV32I-NEXT:    bgeu a5, t6, .LBB18_28
 ; RV32I-NEXT:  # %bb.27:
 ; RV32I-NEXT:    or t4, t0, a6
@@ -8673,8 +8627,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu t3, 7(a0)
 ; RV32I-NEXT:    or a6, s2, s5
 ; RV32I-NEXT:    slli s2, s1, 16
-; RV32I-NEXT:    or s1, s4, s3
-; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    or s1, s8, s3
+; RV32I-NEXT:    slli ra, ra, 16
 ; RV32I-NEXT:    mv a1, t1
 ; RV32I-NEXT:    mv t0, a7
 ; RV32I-NEXT:    beqz a5, .LBB18_30
@@ -8682,26 +8636,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    mv a1, t4
 ; RV32I-NEXT:    mv t0, s0
 ; RV32I-NEXT:  .LBB18_30:
-; RV32I-NEXT:    slli s5, a3, 8
-; RV32I-NEXT:    lbu ra, 1(a0)
-; RV32I-NEXT:    lbu a3, 2(a0)
+; RV32I-NEXT:    slli s8, a3, 8
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu s4, 2(a0)
 ; RV32I-NEXT:    lbu s3, 5(a0)
 ; RV32I-NEXT:    lbu s0, 6(a0)
-; RV32I-NEXT:    slli s4, t3, 8
+; RV32I-NEXT:    slli s5, t3, 8
 ; RV32I-NEXT:    or t4, s2, a6
-; RV32I-NEXT:    or t3, s8, s1
+; RV32I-NEXT:    or t3, ra, s1
 ; RV32I-NEXT:    bltu a5, t6, .LBB18_32
 ; RV32I-NEXT:  # %bb.31:
 ; RV32I-NEXT:    srai a6, a4, 31
 ; RV32I-NEXT:    sw a6, 40(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw a6, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:  .LBB18_32:
-; RV32I-NEXT:    slli a6, ra, 8
-; RV32I-NEXT:    or a3, s5, a3
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a6, s8, s4
 ; RV32I-NEXT:    lbu s1, 0(a0)
 ; RV32I-NEXT:    lbu a0, 4(a0)
 ; RV32I-NEXT:    slli s3, s3, 8
-; RV32I-NEXT:    or s0, s4, s0
+; RV32I-NEXT:    or s0, s5, s0
 ; RV32I-NEXT:    srl s2, t4, a5
 ; RV32I-NEXT:    sll ra, t3, s6
 ; RV32I-NEXT:    bltu a5, t5, .LBB18_34
@@ -8711,8 +8665,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:  .LBB18_34:
 ; RV32I-NEXT:    or s4, s2, ra
 ; RV32I-NEXT:  .LBB18_35:
-; RV32I-NEXT:    or a6, a6, s1
-; RV32I-NEXT:    slli a3, a3, 16
+; RV32I-NEXT:    or a3, a3, s1
+; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    or a0, s3, a0
 ; RV32I-NEXT:    slli s1, s0, 16
 ; RV32I-NEXT:    mv s5, t4
@@ -8720,7 +8674,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:  # %bb.36:
 ; RV32I-NEXT:    mv s5, s4
 ; RV32I-NEXT:  .LBB18_37:
-; RV32I-NEXT:    or s0, a3, a6
+; RV32I-NEXT:    or s0, a6, a3
 ; RV32I-NEXT:    or a0, s1, a0
 ; RV32I-NEXT:    bltu a5, t5, .LBB18_39
 ; RV32I-NEXT:  # %bb.38:
@@ -9158,88 +9112,88 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    or t0, t2, t1
 ; RV64I-NEXT:    lbu s8, 20(a0)
 ; RV64I-NEXT:    lbu s9, 21(a0)
 ; RV64I-NEXT:    lbu s10, 22(a0)
 ; RV64I-NEXT:    lbu s11, 23(a0)
-; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
 ; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    slli s3, s3, 8
 ; RV64I-NEXT:    or a4, t4, t3
 ; RV64I-NEXT:    or a6, t6, t5
-; RV64I-NEXT:    or t0, s1, s0
-; RV64I-NEXT:    lbu t5, 24(a0)
-; RV64I-NEXT:    lbu t6, 25(a0)
-; RV64I-NEXT:    lbu s0, 26(a0)
-; RV64I-NEXT:    lbu s1, 27(a0)
-; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t3, 24(a0)
+; RV64I-NEXT:    lbu t4, 25(a0)
+; RV64I-NEXT:    lbu t5, 26(a0)
+; RV64I-NEXT:    lbu t6, 27(a0)
 ; RV64I-NEXT:    slli s5, s5, 8
 ; RV64I-NEXT:    slli s7, s7, 8
-; RV64I-NEXT:    or t4, s3, s2
-; RV64I-NEXT:    or t2, s5, s4
-; RV64I-NEXT:    or t3, s7, s6
-; RV64I-NEXT:    lbu s2, 28(a0)
-; RV64I-NEXT:    lbu s3, 29(a0)
-; RV64I-NEXT:    lbu s4, 30(a0)
-; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or s0, s5, s4
+; RV64I-NEXT:    or s1, s7, s6
+; RV64I-NEXT:    or s2, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli s11, s11, 8
+; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s5, s9, s8
+; RV64I-NEXT:    slli s4, s4, 8
 ; RV64I-NEXT:    or s6, s11, s10
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or t4, t6, t5
+; RV64I-NEXT:    or t5, s4, s3
 ; RV64I-NEXT:    lbu t6, 0(a1)
-; RV64I-NEXT:    lbu s1, 1(a1)
-; RV64I-NEXT:    lbu s7, 2(a1)
-; RV64I-NEXT:    lbu s8, 3(a1)
-; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
 ; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    or s3, a0, s4
-; RV64I-NEXT:    or t6, s1, t6
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s7, s7, 8
+; RV64I-NEXT:    or s5, a0, s5
+; RV64I-NEXT:    or t6, s3, t6
+; RV64I-NEXT:    or s3, s7, s4
 ; RV64I-NEXT:    lbu a0, 4(a1)
-; RV64I-NEXT:    lbu s1, 5(a1)
-; RV64I-NEXT:    lbu s4, 6(a1)
+; RV64I-NEXT:    lbu s4, 5(a1)
+; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    or s7, s8, s7
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s1, s1, a0
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or s4, s4, a0
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or s4, a1, s4
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    or a1, t1, a7
-; RV64I-NEXT:    slli t4, t4, 16
-; RV64I-NEXT:    or a0, t4, t0
-; RV64I-NEXT:    slli t3, t3, 16
-; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    or s7, a1, s7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    or a1, t0, a7
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    or a0, t2, t1
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    or s0, s1, s0
 ; RV64I-NEXT:    slli s6, s6, 16
-; RV64I-NEXT:    or t1, s6, s5
-; RV64I-NEXT:    slli s0, s0, 16
-; RV64I-NEXT:    or t4, s0, t5
+; RV64I-NEXT:    or a7, s6, s2
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    or t1, t4, t3
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    or t4, s5, t5
 ; RV64I-NEXT:    slli s3, s3, 16
-; RV64I-NEXT:    or t5, s3, s2
+; RV64I-NEXT:    or t5, s3, t6
 ; RV64I-NEXT:    slli s7, s7, 16
-; RV64I-NEXT:    or t6, s7, t6
-; RV64I-NEXT:    slli s4, s4, 16
-; RV64I-NEXT:    or s0, s4, s1
+; RV64I-NEXT:    or t6, s7, s4
 ; RV64I-NEXT:    li t0, 64
 ; RV64I-NEXT:    slli t3, a5, 16
 ; RV64I-NEXT:    slli t2, a6, 16
-; RV64I-NEXT:    slli t1, t1, 32
-; RV64I-NEXT:    slli t5, t5, 32
-; RV64I-NEXT:    slli s0, s0, 32
-; RV64I-NEXT:    or a7, t1, a7
-; RV64I-NEXT:    or a5, t5, t4
-; RV64I-NEXT:    or a6, s0, t6
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli t4, t4, 32
+; RV64I-NEXT:    slli t6, t6, 32
+; RV64I-NEXT:    or a7, a7, s0
+; RV64I-NEXT:    or a5, t4, t1
+; RV64I-NEXT:    or a6, t6, t5
 ; RV64I-NEXT:    slli a6, a6, 5
 ; RV64I-NEXT:    subw t1, a6, t0
 ; RV64I-NEXT:    negw t5, a6
@@ -9453,47 +9407,47 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    lbu t0, 21(a0)
 ; RV32I-NEXT:    lbu t1, 22(a0)
 ; RV32I-NEXT:    lbu t2, 23(a0)
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu t4, 25(a0)
-; RV32I-NEXT:    lbu t5, 26(a0)
-; RV32I-NEXT:    lbu t6, 27(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 24(a0)
+; RV32I-NEXT:    lbu t3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
 ; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or a5, a6, a5
 ; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a5, t2, t1
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a4, t3, a4
 ; RV32I-NEXT:    lbu a7, 28(a0)
-; RV32I-NEXT:    lbu t0, 29(a0)
-; RV32I-NEXT:    lbu t1, 30(a0)
-; RV32I-NEXT:    lbu t2, 31(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    or t4, t6, t5
-; RV32I-NEXT:    or t0, t0, a7
+; RV32I-NEXT:    lbu t1, 29(a0)
+; RV32I-NEXT:    lbu t2, 30(a0)
+; RV32I-NEXT:    lbu t3, 31(a0)
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t1, t1, a7
+; RV32I-NEXT:    or t2, t3, t2
 ; RV32I-NEXT:    lbu a7, 0(a1)
-; RV32I-NEXT:    lbu t5, 1(a1)
-; RV32I-NEXT:    lbu t6, 2(a1)
+; RV32I-NEXT:    lbu t3, 1(a1)
+; RV32I-NEXT:    lbu t5, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    or s0, t5, a7
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or t3, t3, a7
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or t2, a1, t6
+; RV32I-NEXT:    or t6, a1, t5
 ; RV32I-NEXT:    li t5, 32
-; RV32I-NEXT:    slli a7, a4, 16
-; RV32I-NEXT:    slli a1, a5, 16
+; RV32I-NEXT:    slli a7, a5, 16
+; RV32I-NEXT:    slli a1, t0, 16
 ; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli a5, t2, 16
-; RV32I-NEXT:    or t2, t4, t3
-; RV32I-NEXT:    or a4, t1, t0
-; RV32I-NEXT:    or a5, a5, s0
+; RV32I-NEXT:    slli t6, t6, 16
+; RV32I-NEXT:    or t2, t4, a4
+; RV32I-NEXT:    or a4, a5, t1
+; RV32I-NEXT:    or a5, t6, t3
 ; RV32I-NEXT:    slli a5, a5, 5
 ; RV32I-NEXT:    srl s0, t2, a5
 ; RV32I-NEXT:    neg s6, a5
@@ -9559,6 +9513,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:  .LBB19_18:
 ; RV32I-NEXT:    neg s11, s9
 ; RV32I-NEXT:    sw s0, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu s9, t5, .LBB19_20
 ; RV32I-NEXT:  # %bb.19:
 ; RV32I-NEXT:    sra s0, a4, s9
@@ -9567,20 +9522,19 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    sll a3, a4, s11
 ; RV32I-NEXT:    or s0, s0, a3
 ; RV32I-NEXT:  .LBB19_21:
-; RV32I-NEXT:    sw s1, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw t4, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s3, 11(a0)
+; RV32I-NEXT:    lbu s1, 11(a0)
 ; RV32I-NEXT:    lbu a3, 15(a0)
 ; RV32I-NEXT:    mv t4, t2
 ; RV32I-NEXT:    beqz s9, .LBB19_23
 ; RV32I-NEXT:  # %bb.22:
 ; RV32I-NEXT:    mv t4, s0
 ; RV32I-NEXT:  .LBB19_23:
+; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    lbu s2, 9(a0)
-; RV32I-NEXT:    lbu s1, 10(a0)
+; RV32I-NEXT:    lbu s3, 10(a0)
 ; RV32I-NEXT:    lbu s8, 13(a0)
 ; RV32I-NEXT:    lbu ra, 14(a0)
-; RV32I-NEXT:    slli s3, s3, 8
 ; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    bltu s9, t5, .LBB19_25
 ; RV32I-NEXT:  # %bb.24:
@@ -9589,12 +9543,12 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:  .LBB19_25:
 ; RV32I-NEXT:    sra s0, a4, a5
 ; RV32I-NEXT:  .LBB19_26:
-; RV32I-NEXT:    or s1, s3, s1
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    or s1, s1, s3
 ; RV32I-NEXT:    lbu s5, 8(a0)
 ; RV32I-NEXT:    lbu s3, 12(a0)
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s4, s8, 8
-; RV32I-NEXT:    or s8, a3, ra
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or ra, a3, ra
 ; RV32I-NEXT:    bgeu a5, t6, .LBB19_28
 ; RV32I-NEXT:  # %bb.27:
 ; RV32I-NEXT:    or t4, t0, a6
@@ -9604,8 +9558,8 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    lbu t3, 7(a0)
 ; RV32I-NEXT:    or a6, s2, s5
 ; RV32I-NEXT:    slli s2, s1, 16
-; RV32I-NEXT:    or s1, s4, s3
-; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    or s1, s8, s3
+; RV32I-NEXT:    slli ra, ra, 16
 ; RV32I-NEXT:    mv a1, t1
 ; RV32I-NEXT:    mv t0, a7
 ; RV32I-NEXT:    beqz a5, .LBB19_30
@@ -9613,26 +9567,26 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    mv a1, t4
 ; RV32I-NEXT:    mv t0, s0
 ; RV32I-NEXT:  .LBB19_30:
-; RV32I-NEXT:    slli s5, a3, 8
-; RV32I-NEXT:    lbu ra, 1(a0)
-; RV32I-NEXT:    lbu a3, 2(a0)
+; RV32I-NEXT:    slli s8, a3, 8
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu s4, 2(a0)
 ; RV32I-NEXT:    lbu s3, 5(a0)
 ; RV32I-NEXT:    lbu s0, 6(a0)
-; RV32I-NEXT:    slli s4, t3, 8
+; RV32I-NEXT:    slli s5, t3, 8
 ; RV32I-NEXT:    or t4, s2, a6
-; RV32I-NEXT:    or t3, s8, s1
+; RV32I-NEXT:    or t3, ra, s1
 ; RV32I-NEXT:    bltu a5, t6, .LBB19_32
 ; RV32I-NEXT:  # %bb.31:
 ; RV32I-NEXT:    srai a6, a4, 31
 ; RV32I-NEXT:    sw a6, 40(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw a6, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:  .LBB19_32:
-; RV32I-NEXT:    slli a6, ra, 8
-; RV32I-NEXT:    or a3, s5, a3
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a6, s8, s4
 ; RV32I-NEXT:    lbu s1, 0(a0)
 ; RV32I-NEXT:    lbu a0, 4(a0)
 ; RV32I-NEXT:    slli s3, s3, 8
-; RV32I-NEXT:    or s0, s4, s0
+; RV32I-NEXT:    or s0, s5, s0
 ; RV32I-NEXT:    srl s2, t4, a5
 ; RV32I-NEXT:    sll ra, t3, s6
 ; RV32I-NEXT:    bltu a5, t5, .LBB19_34
@@ -9642,8 +9596,8 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:  .LBB19_34:
 ; RV32I-NEXT:    or s4, s2, ra
 ; RV32I-NEXT:  .LBB19_35:
-; RV32I-NEXT:    or a6, a6, s1
-; RV32I-NEXT:    slli a3, a3, 16
+; RV32I-NEXT:    or a3, a3, s1
+; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    or a0, s3, a0
 ; RV32I-NEXT:    slli s1, s0, 16
 ; RV32I-NEXT:    mv s5, t4
@@ -9651,7 +9605,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:  # %bb.36:
 ; RV32I-NEXT:    mv s5, s4
 ; RV32I-NEXT:  .LBB19_37:
-; RV32I-NEXT:    or s0, a3, a6
+; RV32I-NEXT:    or s0, a6, a3
 ; RV32I-NEXT:    or a0, s1, a0
 ; RV32I-NEXT:    bltu a5, t5, .LBB19_39
 ; RV32I-NEXT:  # %bb.38:
@@ -10089,88 +10043,88 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    or t0, t2, t1
 ; RV64I-NEXT:    lbu s8, 20(a0)
 ; RV64I-NEXT:    lbu s9, 21(a0)
 ; RV64I-NEXT:    lbu s10, 22(a0)
 ; RV64I-NEXT:    lbu s11, 23(a0)
-; RV64I-NEXT:    slli t2, t2, 8
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
 ; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    slli s3, s3, 8
 ; RV64I-NEXT:    or a4, t4, t3
 ; RV64I-NEXT:    or a6, t6, t5
-; RV64I-NEXT:    or t0, s1, s0
-; RV64I-NEXT:    lbu t5, 24(a0)
-; RV64I-NEXT:    lbu t6, 25(a0)
-; RV64I-NEXT:    lbu s0, 26(a0)
-; RV64I-NEXT:    lbu s1, 27(a0)
-; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t3, 24(a0)
+; RV64I-NEXT:    lbu t4, 25(a0)
+; RV64I-NEXT:    lbu t5, 26(a0)
+; RV64I-NEXT:    lbu t6, 27(a0)
 ; RV64I-NEXT:    slli s5, s5, 8
 ; RV64I-NEXT:    slli s7, s7, 8
-; RV64I-NEXT:    or t4, s3, s2
-; RV64I-NEXT:    or t2, s5, s4
-; RV64I-NEXT:    or t3, s7, s6
-; RV64I-NEXT:    lbu s2, 28(a0)
-; RV64I-NEXT:    lbu s3, 29(a0)
-; RV64I-NEXT:    lbu s4, 30(a0)
-; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or s0, s5, s4
+; RV64I-NEXT:    or s1, s7, s6
+; RV64I-NEXT:    or s2, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli s11, s11, 8
+; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s5, s9, s8
+; RV64I-NEXT:    slli s4, s4, 8
 ; RV64I-NEXT:    or s6, s11, s10
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or t4, t6, t5
+; RV64I-NEXT:    or t5, s4, s3
 ; RV64I-NEXT:    lbu t6, 0(a1)
-; RV64I-NEXT:    lbu s1, 1(a1)
-; RV64I-NEXT:    lbu s7, 2(a1)
-; RV64I-NEXT:    lbu s8, 3(a1)
-; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
 ; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    or s3, a0, s4
-; RV64I-NEXT:    or t6, s1, t6
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s7, s7, 8
+; RV64I-NEXT:    or s5, a0, s5
+; RV64I-NEXT:    or t6, s3, t6
+; RV64I-NEXT:    or s3, s7, s4
 ; RV64I-NEXT:    lbu a0, 4(a1)
-; RV64I-NEXT:    lbu s1, 5(a1)
-; RV64I-NEXT:    lbu s4, 6(a1)
+; RV64I-NEXT:    lbu s4, 5(a1)
+; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    or s7, s8, s7
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s1, s1, a0
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or s4, s4, a0
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or s4, a1, s4
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    or a1, t1, a7
-; RV64I-NEXT:    slli t4, t4, 16
-; RV64I-NEXT:    or a0, t4, t0
-; RV64I-NEXT:    slli t3, t3, 16
-; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    or s7, a1, s7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    or a1, t0, a7
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    or a0, t2, t1
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    or s0, s1, s0
 ; RV64I-NEXT:    slli s6, s6, 16
-; RV64I-NEXT:    or t1, s6, s5
-; RV64I-NEXT:    slli s0, s0, 16
-; RV64I-NEXT:    or t4, s0, t5
+; RV64I-NEXT:    or a7, s6, s2
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    or t1, t4, t3
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    or t4, s5, t5
 ; RV64I-NEXT:    slli s3, s3, 16
-; RV64I-NEXT:    or t5, s3, s2
+; RV64I-NEXT:    or t5, s3, t6
 ; RV64I-NEXT:    slli s7, s7, 16
-; RV64I-NEXT:    or t6, s7, t6
-; RV64I-NEXT:    slli s4, s4, 16
-; RV64I-NEXT:    or s0, s4, s1
+; RV64I-NEXT:    or t6, s7, s4
 ; RV64I-NEXT:    li t0, 64
 ; RV64I-NEXT:    slli t3, a5, 16
 ; RV64I-NEXT:    slli t2, a6, 16
-; RV64I-NEXT:    slli t1, t1, 32
-; RV64I-NEXT:    slli t5, t5, 32
-; RV64I-NEXT:    slli s0, s0, 32
-; RV64I-NEXT:    or a7, t1, a7
-; RV64I-NEXT:    or a5, t5, t4
-; RV64I-NEXT:    or a6, s0, t6
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli t4, t4, 32
+; RV64I-NEXT:    slli t6, t6, 32
+; RV64I-NEXT:    or a7, a7, s0
+; RV64I-NEXT:    or a5, t4, t1
+; RV64I-NEXT:    or a6, t6, t5
 ; RV64I-NEXT:    slli a6, a6, 6
 ; RV64I-NEXT:    subw t1, a6, t0
 ; RV64I-NEXT:    negw t5, a6
@@ -10384,47 +10338,47 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    lbu t0, 21(a0)
 ; RV32I-NEXT:    lbu t1, 22(a0)
 ; RV32I-NEXT:    lbu t2, 23(a0)
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu t4, 25(a0)
-; RV32I-NEXT:    lbu t5, 26(a0)
-; RV32I-NEXT:    lbu t6, 27(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 24(a0)
+; RV32I-NEXT:    lbu t3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
 ; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or a5, a6, a5
 ; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a5, t2, t1
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a4, t3, a4
 ; RV32I-NEXT:    lbu a7, 28(a0)
-; RV32I-NEXT:    lbu t0, 29(a0)
-; RV32I-NEXT:    lbu t1, 30(a0)
-; RV32I-NEXT:    lbu t2, 31(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    or t4, t6, t5
-; RV32I-NEXT:    or t0, t0, a7
+; RV32I-NEXT:    lbu t1, 29(a0)
+; RV32I-NEXT:    lbu t2, 30(a0)
+; RV32I-NEXT:    lbu t3, 31(a0)
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t1, t1, a7
+; RV32I-NEXT:    or t2, t3, t2
 ; RV32I-NEXT:    lbu a7, 0(a1)
-; RV32I-NEXT:    lbu t5, 1(a1)
-; RV32I-NEXT:    lbu t6, 2(a1)
+; RV32I-NEXT:    lbu t3, 1(a1)
+; RV32I-NEXT:    lbu t5, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    or s0, t5, a7
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or t3, t3, a7
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or t2, a1, t6
+; RV32I-NEXT:    or t6, a1, t5
 ; RV32I-NEXT:    li t5, 32
-; RV32I-NEXT:    slli a7, a4, 16
-; RV32I-NEXT:    slli a1, a5, 16
+; RV32I-NEXT:    slli a7, a5, 16
+; RV32I-NEXT:    slli a1, t0, 16
 ; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli a5, t2, 16
-; RV32I-NEXT:    or t2, t4, t3
-; RV32I-NEXT:    or a4, t1, t0
-; RV32I-NEXT:    or a5, a5, s0
+; RV32I-NEXT:    slli t6, t6, 16
+; RV32I-NEXT:    or t2, t4, a4
+; RV32I-NEXT:    or a4, a5, t1
+; RV32I-NEXT:    or a5, t6, t3
 ; RV32I-NEXT:    slli a5, a5, 6
 ; RV32I-NEXT:    srl s0, t2, a5
 ; RV32I-NEXT:    neg s6, a5
@@ -10490,6 +10444,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:  .LBB20_18:
 ; RV32I-NEXT:    neg s11, s9
 ; RV32I-NEXT:    sw s0, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    bltu s9, t5, .LBB20_20
 ; RV32I-NEXT:  # %bb.19:
 ; RV32I-NEXT:    sra s0, a4, s9
@@ -10498,20 +10453,19 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    sll a3, a4, s11
 ; RV32I-NEXT:    or s0, s0, a3
 ; RV32I-NEXT:  .LBB20_21:
-; RV32I-NEXT:    sw s1, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw t4, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s3, 11(a0)
+; RV32I-NEXT:    lbu s1, 11(a0)
 ; RV32I-NEXT:    lbu a3, 15(a0)
 ; RV32I-NEXT:    mv t4, t2
 ; RV32I-NEXT:    beqz s9, .LBB20_23
 ; RV32I-NEXT:  # %bb.22:
 ; RV32I-NEXT:    mv t4, s0
 ; RV32I-NEXT:  .LBB20_23:
+; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    lbu s2, 9(a0)
-; RV32I-NEXT:    lbu s1, 10(a0)
+; RV32I-NEXT:    lbu s3, 10(a0)
 ; RV32I-NEXT:    lbu s8, 13(a0)
 ; RV32I-NEXT:    lbu ra, 14(a0)
-; RV32I-NEXT:    slli s3, s3, 8
 ; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    bltu s9, t5, .LBB20_25
 ; RV32I-NEXT:  # %bb.24:
@@ -10520,12 +10474,12 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:  .LBB20_25:
 ; RV32I-NEXT:    sra s0, a4, a5
 ; RV32I-NEXT:  .LBB20_26:
-; RV32I-NEXT:    or s1, s3, s1
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    or s1, s1, s3
 ; RV32I-NEXT:    lbu s5, 8(a0)
 ; RV32I-NEXT:    lbu s3, 12(a0)
-; RV32I-NEXT:    slli s2, s2, 8
-; RV32I-NEXT:    slli s4, s8, 8
-; RV32I-NEXT:    or s8, a3, ra
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or ra, a3, ra
 ; RV32I-NEXT:    bgeu a5, t6, .LBB20_28
 ; RV32I-NEXT:  # %bb.27:
 ; RV32I-NEXT:    or t4, t0, a6
@@ -10535,8 +10489,8 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    lbu t3, 7(a0)
 ; RV32I-NEXT:    or a6, s2, s5
 ; RV32I-NEXT:    slli s2, s1, 16
-; RV32I-NEXT:    or s1, s4, s3
-; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    or s1, s8, s3
+; RV32I-NEXT:    slli ra, ra, 16
 ; RV32I-NEXT:    mv a1, t1
 ; RV32I-NEXT:    mv t0, a7
 ; RV32I-NEXT:    beqz a5, .LBB20_30
@@ -10544,26 +10498,26 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    mv a1, t4
 ; RV32I-NEXT:    mv t0, s0
 ; RV32I-NEXT:  .LBB20_30:
-; RV32I-NEXT:    slli s5, a3, 8
-; RV32I-NEXT:    lbu ra, 1(a0)
-; RV32I-NEXT:    lbu a3, 2(a0)
+; RV32I-NEXT:    slli s8, a3, 8
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu s4, 2(a0)
 ; RV32I-NEXT:    lbu s3, 5(a0)
 ; RV32I-NEXT:    lbu s0, 6(a0)
-; RV32I-NEXT:    slli s4, t3, 8
+; RV32I-NEXT:    slli s5, t3, 8
 ; RV32I-NEXT:    or t4, s2, a6
-; RV32I-NEXT:    or t3, s8, s1
+; RV32I-NEXT:    or t3, ra, s1
 ; RV32I-NEXT:    bltu a5, t6, .LBB20_32
 ; RV32I-NEXT:  # %bb.31:
 ; RV32I-NEXT:    srai a6, a4, 31
 ; RV32I-NEXT:    sw a6, 40(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw a6, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:  .LBB20_32:
-; RV32I-NEXT:    slli a6, ra, 8
-; RV32I-NEXT:    or a3, s5, a3
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a6, s8, s4
 ; RV32I-NEXT:    lbu s1, 0(a0)
 ; RV32I-NEXT:    lbu a0, 4(a0)
 ; RV32I-NEXT:    slli s3, s3, 8
-; RV32I-NEXT:    or s0, s4, s0
+; RV32I-NEXT:    or s0, s5, s0
 ; RV32I-NEXT:    srl s2, t4, a5
 ; RV32I-NEXT:    sll ra, t3, s6
 ; RV32I-NEXT:    bltu a5, t5, .LBB20_34
@@ -10573,8 +10527,8 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:  .LBB20_34:
 ; RV32I-NEXT:    or s4, s2, ra
 ; RV32I-NEXT:  .LBB20_35:
-; RV32I-NEXT:    or a6, a6, s1
-; RV32I-NEXT:    slli a3, a3, 16
+; RV32I-NEXT:    or a3, a3, s1
+; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    or a0, s3, a0
 ; RV32I-NEXT:    slli s1, s0, 16
 ; RV32I-NEXT:    mv s5, t4
@@ -10582,7 +10536,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:  # %bb.36:
 ; RV32I-NEXT:    mv s5, s4
 ; RV32I-NEXT:  .LBB20_37:
-; RV32I-NEXT:    or s0, a3, a6
+; RV32I-NEXT:    or s0, a6, a3
 ; RV32I-NEXT:    or a0, s1, a0
 ; RV32I-NEXT:    bltu a5, t5, .LBB20_39
 ; RV32I-NEXT:  # %bb.38:
diff --git a/llvm/test/CodeGen/RISCV/abds-neg.ll b/llvm/test/CodeGen/RISCV/abds-neg.ll
index c9a48acb8d14a..d7290e1e65540 100644
--- a/llvm/test/CodeGen/RISCV/abds-neg.ll
+++ b/llvm/test/CodeGen/RISCV/abds-neg.ll
@@ -625,42 +625,42 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a6, 8(a1)
 ; RV32I-NEXT:    lw t1, 12(a1)
+; RV32I-NEXT:    lw a1, 0(a2)
+; RV32I-NEXT:    lw a5, 4(a2)
 ; RV32I-NEXT:    lw t0, 8(a2)
 ; RV32I-NEXT:    lw t2, 12(a2)
-; RV32I-NEXT:    lw a1, 0(a2)
-; RV32I-NEXT:    lw a2, 4(a2)
 ; RV32I-NEXT:    sltu t3, t0, a6
-; RV32I-NEXT:    mv t4, t3
+; RV32I-NEXT:    mv t5, t3
 ; RV32I-NEXT:    beq t1, t2, .LBB11_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slt t4, t2, t1
+; RV32I-NEXT:    slt t5, t2, t1
 ; RV32I-NEXT:  .LBB11_2:
-; RV32I-NEXT:    sltu a5, a1, a3
-; RV32I-NEXT:    sltu t6, a2, a4
-; RV32I-NEXT:    mv a7, a5
-; RV32I-NEXT:    beq a4, a2, .LBB11_4
+; RV32I-NEXT:    sltu t4, a5, a4
+; RV32I-NEXT:    sltu a2, a1, a3
+; RV32I-NEXT:    mv a7, a2
+; RV32I-NEXT:    beq a4, a5, .LBB11_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv a7, t6
+; RV32I-NEXT:    mv a7, t4
 ; RV32I-NEXT:  .LBB11_4:
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    xor t5, t1, t2
+; RV32I-NEXT:    xor t6, t1, t2
 ; RV32I-NEXT:    xor s0, a6, t0
-; RV32I-NEXT:    or t5, s0, t5
-; RV32I-NEXT:    beqz t5, .LBB11_6
+; RV32I-NEXT:    or t6, s0, t6
+; RV32I-NEXT:    beqz t6, .LBB11_6
 ; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    mv a7, t4
+; RV32I-NEXT:    mv a7, t5
 ; RV32I-NEXT:  .LBB11_6:
-; RV32I-NEXT:    mv t5, a5
-; RV32I-NEXT:    beq a2, a4, .LBB11_8
+; RV32I-NEXT:    mv t5, a2
+; RV32I-NEXT:    beq a5, a4, .LBB11_8
 ; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    mv t5, t6
+; RV32I-NEXT:    mv t5, t4
 ; RV32I-NEXT:  .LBB11_8:
 ; RV32I-NEXT:    sltu t4, a3, a1
 ; RV32I-NEXT:    mv t6, t4
-; RV32I-NEXT:    beq a4, a2, .LBB11_10
+; RV32I-NEXT:    beq a4, a5, .LBB11_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    sltu t6, a4, a2
+; RV32I-NEXT:    sltu t6, a4, a5
 ; RV32I-NEXT:  .LBB11_10:
 ; RV32I-NEXT:    bnez a7, .LBB11_12
 ; RV32I-NEXT:  # %bb.11:
@@ -684,12 +684,12 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    add t0, t0, t1
 ; RV32I-NEXT:    bnez a7, .LBB11_15
 ; RV32I-NEXT:  # %bb.14:
-; RV32I-NEXT:    sub a2, a2, a4
-; RV32I-NEXT:    sub a2, a2, a5
+; RV32I-NEXT:    sub a5, a5, a4
+; RV32I-NEXT:    sub a2, a5, a2
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    j .LBB11_16
 ; RV32I-NEXT:  .LBB11_15:
-; RV32I-NEXT:    sub a4, a4, a2
+; RV32I-NEXT:    sub a4, a4, a5
 ; RV32I-NEXT:    sub a2, a4, t4
 ; RV32I-NEXT:    sub a1, a3, a1
 ; RV32I-NEXT:  .LBB11_16:
@@ -744,42 +744,42 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw a4, 4(a1)
 ; RV32ZBB-NEXT:    lw a6, 8(a1)
 ; RV32ZBB-NEXT:    lw t1, 12(a1)
+; RV32ZBB-NEXT:    lw a1, 0(a2)
+; RV32ZBB-NEXT:    lw a5, 4(a2)
 ; RV32ZBB-NEXT:    lw t0, 8(a2)
 ; RV32ZBB-NEXT:    lw t2, 12(a2)
-; RV32ZBB-NEXT:    lw a1, 0(a2)
-; RV32ZBB-NEXT:    lw a2, 4(a2)
 ; RV32ZBB-NEXT:    sltu t3, t0, a6
-; RV32ZBB-NEXT:    mv t4, t3
+; RV32ZBB-NEXT:    mv t5, t3
 ; RV32ZBB-NEXT:    beq t1, t2, .LBB11_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    slt t4, t2, t1
+; RV32ZBB-NEXT:    slt t5, t2, t1
 ; RV32ZBB-NEXT:  .LBB11_2:
-; RV32ZBB-NEXT:    sltu a5, a1, a3
-; RV32ZBB-NEXT:    sltu t6, a2, a4
-; RV32ZBB-NEXT:    mv a7, a5
-; RV32ZBB-NEXT:    beq a4, a2, .LBB11_4
+; RV32ZBB-NEXT:    sltu t4, a5, a4
+; RV32ZBB-NEXT:    sltu a2, a1, a3
+; RV32ZBB-NEXT:    mv a7, a2
+; RV32ZBB-NEXT:    beq a4, a5, .LBB11_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    mv a7, t6
+; RV32ZBB-NEXT:    mv a7, t4
 ; RV32ZBB-NEXT:  .LBB11_4:
 ; RV32ZBB-NEXT:    addi sp, sp, -16
 ; RV32ZBB-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZBB-NEXT:    xor t5, t1, t2
+; RV32ZBB-NEXT:    xor t6, t1, t2
 ; RV32ZBB-NEXT:    xor s0, a6, t0
-; RV32ZBB-NEXT:    or t5, s0, t5
-; RV32ZBB-NEXT:    beqz t5, .LBB11_6
+; RV32ZBB-NEXT:    or t6, s0, t6
+; RV32ZBB-NEXT:    beqz t6, .LBB11_6
 ; RV32ZBB-NEXT:  # %bb.5:
-; RV32ZBB-NEXT:    mv a7, t4
+; RV32ZBB-NEXT:    mv a7, t5
 ; RV32ZBB-NEXT:  .LBB11_6:
-; RV32ZBB-NEXT:    mv t5, a5
-; RV32ZBB-NEXT:    beq a2, a4, .LBB11_8
+; RV32ZBB-NEXT:    mv t5, a2
+; RV32ZBB-NEXT:    beq a5, a4, .LBB11_8
 ; RV32ZBB-NEXT:  # %bb.7:
-; RV32ZBB-NEXT:    mv t5, t6
+; RV32ZBB-NEXT:    mv t5, t4
 ; RV32ZBB-NEXT:  .LBB11_8:
 ; RV32ZBB-NEXT:    sltu t4, a3, a1
 ; RV32ZBB-NEXT:    mv t6, t4
-; RV32ZBB-NEXT:    beq a4, a2, .LBB11_10
+; RV32ZBB-NEXT:    beq a4, a5, .LBB11_10
 ; RV32ZBB-NEXT:  # %bb.9:
-; RV32ZBB-NEXT:    sltu t6, a4, a2
+; RV32ZBB-NEXT:    sltu t6, a4, a5
 ; RV32ZBB-NEXT:  .LBB11_10:
 ; RV32ZBB-NEXT:    bnez a7, .LBB11_12
 ; RV32ZBB-NEXT:  # %bb.11:
@@ -803,12 +803,12 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    add t0, t0, t1
 ; RV32ZBB-NEXT:    bnez a7, .LBB11_15
 ; RV32ZBB-NEXT:  # %bb.14:
-; RV32ZBB-NEXT:    sub a2, a2, a4
-; RV32ZBB-NEXT:    sub a2, a2, a5
+; RV32ZBB-NEXT:    sub a5, a5, a4
+; RV32ZBB-NEXT:    sub a2, a5, a2
 ; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    j .LBB11_16
 ; RV32ZBB-NEXT:  .LBB11_15:
-; RV32ZBB-NEXT:    sub a4, a4, a2
+; RV32ZBB-NEXT:    sub a4, a4, a5
 ; RV32ZBB-NEXT:    sub a2, a4, t4
 ; RV32ZBB-NEXT:    sub a1, a3, a1
 ; RV32ZBB-NEXT:  .LBB11_16:
@@ -872,42 +872,42 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a6, 8(a1)
 ; RV32I-NEXT:    lw t1, 12(a1)
+; RV32I-NEXT:    lw a1, 0(a2)
+; RV32I-NEXT:    lw a5, 4(a2)
 ; RV32I-NEXT:    lw t0, 8(a2)
 ; RV32I-NEXT:    lw t2, 12(a2)
-; RV32I-NEXT:    lw a1, 0(a2)
-; RV32I-NEXT:    lw a2, 4(a2)
 ; RV32I-NEXT:    sltu t3, t0, a6
-; RV32I-NEXT:    mv t4, t3
+; RV32I-NEXT:    mv t5, t3
 ; RV32I-NEXT:    beq t1, t2, .LBB12_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slt t4, t2, t1
+; RV32I-NEXT:    slt t5, t2, t1
 ; RV32I-NEXT:  .LBB12_2:
-; RV32I-NEXT:    sltu a5, a1, a3
-; RV32I-NEXT:    sltu t6, a2, a4
-; RV32I-NEXT:    mv a7, a5
-; RV32I-NEXT:    beq a4, a2, .LBB12_4
+; RV32I-NEXT:    sltu t4, a5, a4
+; RV32I-NEXT:    sltu a2, a1, a3
+; RV32I-NEXT:    mv a7, a2
+; RV32I-NEXT:    beq a4, a5, .LBB12_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv a7, t6
+; RV32I-NEXT:    mv a7, t4
 ; RV32I-NEXT:  .LBB12_4:
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    xor t5, t1, t2
+; RV32I-NEXT:    xor t6, t1, t2
 ; RV32I-NEXT:    xor s0, a6, t0
-; RV32I-NEXT:    or t5, s0, t5
-; RV32I-NEXT:    beqz t5, .LBB12_6
+; RV32I-NEXT:    or t6, s0, t6
+; RV32I-NEXT:    beqz t6, .LBB12_6
 ; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    mv a7, t4
+; RV32I-NEXT:    mv a7, t5
 ; RV32I-NEXT:  .LBB12_6:
-; RV32I-NEXT:    mv t5, a5
-; RV32I-NEXT:    beq a2, a4, .LBB12_8
+; RV32I-NEXT:    mv t5, a2
+; RV32I-NEXT:    beq a5, a4, .LBB12_8
 ; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    mv t5, t6
+; RV32I-NEXT:    mv t5, t4
 ; RV32I-NEXT:  .LBB12_8:
 ; RV32I-NEXT:    sltu t4, a3, a1
 ; RV32I-NEXT:    mv t6, t4
-; RV32I-NEXT:    beq a4, a2, .LBB12_10
+; RV32I-NEXT:    beq a4, a5, .LBB12_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    sltu t6, a4, a2
+; RV32I-NEXT:    sltu t6, a4, a5
 ; RV32I-NEXT:  .LBB12_10:
 ; RV32I-NEXT:    bnez a7, .LBB12_12
 ; RV32I-NEXT:  # %bb.11:
@@ -931,12 +931,12 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    add t0, t0, t1
 ; RV32I-NEXT:    bnez a7, .LBB12_15
 ; RV32I-NEXT:  # %bb.14:
-; RV32I-NEXT:    sub a2, a2, a4
-; RV32I-NEXT:    sub a2, a2, a5
+; RV32I-NEXT:    sub a5, a5, a4
+; RV32I-NEXT:    sub a2, a5, a2
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    j .LBB12_16
 ; RV32I-NEXT:  .LBB12_15:
-; RV32I-NEXT:    sub a4, a4, a2
+; RV32I-NEXT:    sub a4, a4, a5
 ; RV32I-NEXT:    sub a2, a4, t4
 ; RV32I-NEXT:    sub a1, a3, a1
 ; RV32I-NEXT:  .LBB12_16:
@@ -991,42 +991,42 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw a4, 4(a1)
 ; RV32ZBB-NEXT:    lw a6, 8(a1)
 ; RV32ZBB-NEXT:    lw t1, 12(a1)
+; RV32ZBB-NEXT:    lw a1, 0(a2)
+; RV32ZBB-NEXT:    lw a5, 4(a2)
 ; RV32ZBB-NEXT:    lw t0, 8(a2)
 ; RV32ZBB-NEXT:    lw t2, 12(a2)
-; RV32ZBB-NEXT:    lw a1, 0(a2)
-; RV32ZBB-NEXT:    lw a2, 4(a2)
 ; RV32ZBB-NEXT:    sltu t3, t0, a6
-; RV32ZBB-NEXT:    mv t4, t3
+; RV32ZBB-NEXT:    mv t5, t3
 ; RV32ZBB-NEXT:    beq t1, t2, .LBB12_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    slt t4, t2, t1
+; RV32ZBB-NEXT:    slt t5, t2, t1
 ; RV32ZBB-NEXT:  .LBB12_2:
-; RV32ZBB-NEXT:    sltu a5, a1, a3
-; RV32ZBB-NEXT:    sltu t6, a2, a4
-; RV32ZBB-NEXT:    mv a7, a5
-; RV32ZBB-NEXT:    beq a4, a2, .LBB12_4
+; RV32ZBB-NEXT:    sltu t4, a5, a4
+; RV32ZBB-NEXT:    sltu a2, a1, a3
+; RV32ZBB-NEXT:    mv a7, a2
+; RV32ZBB-NEXT:    beq a4, a5, .LBB12_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    mv a7, t6
+; RV32ZBB-NEXT:    mv a7, t4
 ; RV32ZBB-NEXT:  .LBB12_4:
 ; RV32ZBB-NEXT:    addi sp, sp, -16
 ; RV32ZBB-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZBB-NEXT:    xor t5, t1, t2
+; RV32ZBB-NEXT:    xor t6, t1, t2
 ; RV32ZBB-NEXT:    xor s0, a6, t0
-; RV32ZBB-NEXT:    or t5, s0, t5
-; RV32ZBB-NEXT:    beqz t5, .LBB12_6
+; RV32ZBB-NEXT:    or t6, s0, t6
+; RV32ZBB-NEXT:    beqz t6, .LBB12_6
 ; RV32ZBB-NEXT:  # %bb.5:
-; RV32ZBB-NEXT:    mv a7, t4
+; RV32ZBB-NEXT:    mv a7, t5
 ; RV32ZBB-NEXT:  .LBB12_6:
-; RV32ZBB-NEXT:    mv t5, a5
-; RV32ZBB-NEXT:    beq a2, a4, .LBB12_8
+; RV32ZBB-NEXT:    mv t5, a2
+; RV32ZBB-NEXT:    beq a5, a4, .LBB12_8
 ; RV32ZBB-NEXT:  # %bb.7:
-; RV32ZBB-NEXT:    mv t5, t6
+; RV32ZBB-NEXT:    mv t5, t4
 ; RV32ZBB-NEXT:  .LBB12_8:
 ; RV32ZBB-NEXT:    sltu t4, a3, a1
 ; RV32ZBB-NEXT:    mv t6, t4
-; RV32ZBB-NEXT:    beq a4, a2, .LBB12_10
+; RV32ZBB-NEXT:    beq a4, a5, .LBB12_10
 ; RV32ZBB-NEXT:  # %bb.9:
-; RV32ZBB-NEXT:    sltu t6, a4, a2
+; RV32ZBB-NEXT:    sltu t6, a4, a5
 ; RV32ZBB-NEXT:  .LBB12_10:
 ; RV32ZBB-NEXT:    bnez a7, .LBB12_12
 ; RV32ZBB-NEXT:  # %bb.11:
@@ -1050,12 +1050,12 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    add t0, t0, t1
 ; RV32ZBB-NEXT:    bnez a7, .LBB12_15
 ; RV32ZBB-NEXT:  # %bb.14:
-; RV32ZBB-NEXT:    sub a2, a2, a4
-; RV32ZBB-NEXT:    sub a2, a2, a5
+; RV32ZBB-NEXT:    sub a5, a5, a4
+; RV32ZBB-NEXT:    sub a2, a5, a2
 ; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    j .LBB12_16
 ; RV32ZBB-NEXT:  .LBB12_15:
-; RV32ZBB-NEXT:    sub a4, a4, a2
+; RV32ZBB-NEXT:    sub a4, a4, a5
 ; RV32ZBB-NEXT:    sub a2, a4, t4
 ; RV32ZBB-NEXT:    sub a1, a3, a1
 ; RV32ZBB-NEXT:  .LBB12_16:
@@ -1382,30 +1382,30 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind {
 define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: abd_minmax_i128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a6, 4(a2)
-; RV32I-NEXT:    lw a7, 8(a2)
+; RV32I-NEXT:    lw a5, 4(a2)
+; RV32I-NEXT:    lw a6, 8(a2)
 ; RV32I-NEXT:    lw t0, 12(a2)
-; RV32I-NEXT:    lw a5, 12(a1)
 ; RV32I-NEXT:    lw a3, 4(a1)
 ; RV32I-NEXT:    lw a4, 8(a1)
-; RV32I-NEXT:    beq a5, t0, .LBB17_2
+; RV32I-NEXT:    lw a7, 12(a1)
+; RV32I-NEXT:    beq a7, t0, .LBB17_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slt t1, a5, t0
+; RV32I-NEXT:    slt t1, a7, t0
 ; RV32I-NEXT:    j .LBB17_3
 ; RV32I-NEXT:  .LBB17_2:
-; RV32I-NEXT:    sltu t1, a4, a7
+; RV32I-NEXT:    sltu t1, a4, a6
 ; RV32I-NEXT:  .LBB17_3:
 ; RV32I-NEXT:    lw t2, 0(a2)
 ; RV32I-NEXT:    lw a1, 0(a1)
-; RV32I-NEXT:    beq a3, a6, .LBB17_5
+; RV32I-NEXT:    beq a3, a5, .LBB17_5
 ; RV32I-NEXT:  # %bb.4:
-; RV32I-NEXT:    sltu t6, a3, a6
+; RV32I-NEXT:    sltu t6, a3, a5
 ; RV32I-NEXT:    j .LBB17_6
 ; RV32I-NEXT:  .LBB17_5:
 ; RV32I-NEXT:    sltu t6, a1, t2
 ; RV32I-NEXT:  .LBB17_6:
-; RV32I-NEXT:    xor a2, a5, t0
-; RV32I-NEXT:    xor t3, a4, a7
+; RV32I-NEXT:    xor a2, a7, t0
+; RV32I-NEXT:    xor t3, a4, a6
 ; RV32I-NEXT:    or t5, t3, a2
 ; RV32I-NEXT:    beqz t5, .LBB17_8
 ; RV32I-NEXT:  # %bb.7:
@@ -1413,27 +1413,27 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB17_8:
 ; RV32I-NEXT:    mv a2, a1
 ; RV32I-NEXT:    mv t1, a3
-; RV32I-NEXT:    mv t4, a5
+; RV32I-NEXT:    mv t4, a7
 ; RV32I-NEXT:    mv t3, a4
 ; RV32I-NEXT:    bnez t6, .LBB17_10
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    mv a2, t2
-; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    mv t1, a5
 ; RV32I-NEXT:    mv t4, t0
-; RV32I-NEXT:    mv t3, a7
+; RV32I-NEXT:    mv t3, a6
 ; RV32I-NEXT:  .LBB17_10:
-; RV32I-NEXT:    beq a5, t0, .LBB17_12
+; RV32I-NEXT:    beq a7, t0, .LBB17_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    slt t6, t0, a5
+; RV32I-NEXT:    slt t6, t0, a7
 ; RV32I-NEXT:    j .LBB17_13
 ; RV32I-NEXT:  .LBB17_12:
-; RV32I-NEXT:    sltu t6, a7, a4
+; RV32I-NEXT:    sltu t6, a6, a4
 ; RV32I-NEXT:  .LBB17_13:
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    beq a3, a6, .LBB17_15
+; RV32I-NEXT:    beq a3, a5, .LBB17_15
 ; RV32I-NEXT:  # %bb.14:
-; RV32I-NEXT:    sltu s0, a6, a3
+; RV32I-NEXT:    sltu s0, a5, a3
 ; RV32I-NEXT:    bnez t5, .LBB17_16
 ; RV32I-NEXT:    j .LBB17_17
 ; RV32I-NEXT:  .LBB17_15:
@@ -1445,14 +1445,14 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    bnez s0, .LBB17_19
 ; RV32I-NEXT:  # %bb.18:
 ; RV32I-NEXT:    mv a1, t2
-; RV32I-NEXT:    mv a3, a6
-; RV32I-NEXT:    mv a5, t0
-; RV32I-NEXT:    mv a4, a7
+; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a7, t0
+; RV32I-NEXT:    mv a4, a6
 ; RV32I-NEXT:  .LBB17_19:
-; RV32I-NEXT:    sltu a7, t3, a4
-; RV32I-NEXT:    sub a5, t4, a5
+; RV32I-NEXT:    sltu a5, t3, a4
+; RV32I-NEXT:    sub a6, t4, a7
+; RV32I-NEXT:    sub a5, a6, a5
 ; RV32I-NEXT:    sltu a6, a2, a1
-; RV32I-NEXT:    sub a5, a5, a7
 ; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beq t1, a3, .LBB17_21
 ; RV32I-NEXT:  # %bb.20:
@@ -1509,30 +1509,30 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ;
 ; RV32ZBB-LABEL: abd_minmax_i128:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a6, 4(a2)
-; RV32ZBB-NEXT:    lw a7, 8(a2)
+; RV32ZBB-NEXT:    lw a5, 4(a2)
+; RV32ZBB-NEXT:    lw a6, 8(a2)
 ; RV32ZBB-NEXT:    lw t0, 12(a2)
-; RV32ZBB-NEXT:    lw a5, 12(a1)
 ; RV32ZBB-NEXT:    lw a3, 4(a1)
 ; RV32ZBB-NEXT:    lw a4, 8(a1)
-; RV32ZBB-NEXT:    beq a5, t0, .LBB17_2
+; RV32ZBB-NEXT:    lw a7, 12(a1)
+; RV32ZBB-NEXT:    beq a7, t0, .LBB17_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    slt t1, a5, t0
+; RV32ZBB-NEXT:    slt t1, a7, t0
 ; RV32ZBB-NEXT:    j .LBB17_3
 ; RV32ZBB-NEXT:  .LBB17_2:
-; RV32ZBB-NEXT:    sltu t1, a4, a7
+; RV32ZBB-NEXT:    sltu t1, a4, a6
 ; RV32ZBB-NEXT:  .LBB17_3:
 ; RV32ZBB-NEXT:    lw t2, 0(a2)
 ; RV32ZBB-NEXT:    lw a1, 0(a1)
-; RV32ZBB-NEXT:    beq a3, a6, .LBB17_5
+; RV32ZBB-NEXT:    beq a3, a5, .LBB17_5
 ; RV32ZBB-NEXT:  # %bb.4:
-; RV32ZBB-NEXT:    sltu t6, a3, a6
+; RV32ZBB-NEXT:    sltu t6, a3, a5
 ; RV32ZBB-NEXT:    j .LBB17_6
 ; RV32ZBB-NEXT:  .LBB17_5:
 ; RV32ZBB-NEXT:    sltu t6, a1, t2
 ; RV32ZBB-NEXT:  .LBB17_6:
-; RV32ZBB-NEXT:    xor a2, a5, t0
-; RV32ZBB-NEXT:    xor t3, a4, a7
+; RV32ZBB-NEXT:    xor a2, a7, t0
+; RV32ZBB-NEXT:    xor t3, a4, a6
 ; RV32ZBB-NEXT:    or t5, t3, a2
 ; RV32ZBB-NEXT:    beqz t5, .LBB17_8
 ; RV32ZBB-NEXT:  # %bb.7:
@@ -1540,27 +1540,27 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB17_8:
 ; RV32ZBB-NEXT:    mv a2, a1
 ; RV32ZBB-NEXT:    mv t1, a3
-; RV32ZBB-NEXT:    mv t4, a5
+; RV32ZBB-NEXT:    mv t4, a7
 ; RV32ZBB-NEXT:    mv t3, a4
 ; RV32ZBB-NEXT:    bnez t6, .LBB17_10
 ; RV32ZBB-NEXT:  # %bb.9:
 ; RV32ZBB-NEXT:    mv a2, t2
-; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    mv t1, a5
 ; RV32ZBB-NEXT:    mv t4, t0
-; RV32ZBB-NEXT:    mv t3, a7
+; RV32ZBB-NEXT:    mv t3, a6
 ; RV32ZBB-NEXT:  .LBB17_10:
-; RV32ZBB-NEXT:    beq a5, t0, .LBB17_12
+; RV32ZBB-NEXT:    beq a7, t0, .LBB17_12
 ; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    slt t6, t0, a5
+; RV32ZBB-NEXT:    slt t6, t0, a7
 ; RV32ZBB-NEXT:    j .LBB17_13
 ; RV32ZBB-NEXT:  .LBB17_12:
-; RV32ZBB-NEXT:    sltu t6, a7, a4
+; RV32ZBB-NEXT:    sltu t6, a6, a4
 ; RV32ZBB-NEXT:  .LBB17_13:
 ; RV32ZBB-NEXT:    addi sp, sp, -16
 ; RV32ZBB-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZBB-NEXT:    beq a3, a6, .LBB17_15
+; RV32ZBB-NEXT:    beq a3, a5, .LBB17_15
 ; RV32ZBB-NEXT:  # %bb.14:
-; RV32ZBB-NEXT:    sltu s0, a6, a3
+; RV32ZBB-NEXT:    sltu s0, a5, a3
 ; RV32ZBB-NEXT:    bnez t5, .LBB17_16
 ; RV32ZBB-NEXT:    j .LBB17_17
 ; RV32ZBB-NEXT:  .LBB17_15:
@@ -1572,14 +1572,14 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    bnez s0, .LBB17_19
 ; RV32ZBB-NEXT:  # %bb.18:
 ; RV32ZBB-NEXT:    mv a1, t2
-; RV32ZBB-NEXT:    mv a3, a6
-; RV32ZBB-NEXT:    mv a5, t0
-; RV32ZBB-NEXT:    mv a4, a7
+; RV32ZBB-NEXT:    mv a3, a5
+; RV32ZBB-NEXT:    mv a7, t0
+; RV32ZBB-NEXT:    mv a4, a6
 ; RV32ZBB-NEXT:  .LBB17_19:
-; RV32ZBB-NEXT:    sltu a7, t3, a4
-; RV32ZBB-NEXT:    sub a5, t4, a5
+; RV32ZBB-NEXT:    sltu a5, t3, a4
+; RV32ZBB-NEXT:    sub a6, t4, a7
+; RV32ZBB-NEXT:    sub a5, a6, a5
 ; RV32ZBB-NEXT:    sltu a6, a2, a1
-; RV32ZBB-NEXT:    sub a5, a5, a7
 ; RV32ZBB-NEXT:    mv a7, a6
 ; RV32ZBB-NEXT:    beq t1, a3, .LBB17_21
 ; RV32ZBB-NEXT:  # %bb.20:
@@ -1862,26 +1862,26 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a3, 0(a2)
 ; RV32I-NEXT:    lw a4, 4(a2)
-; RV32I-NEXT:    lw a5, 8(a2)
-; RV32I-NEXT:    lw a7, 12(a2)
-; RV32I-NEXT:    lw a6, 8(a1)
-; RV32I-NEXT:    lw t0, 12(a1)
+; RV32I-NEXT:    lw a6, 8(a2)
+; RV32I-NEXT:    lw t0, 12(a2)
 ; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
-; RV32I-NEXT:    sltu t1, a6, a5
-; RV32I-NEXT:    mv t4, t1
-; RV32I-NEXT:    beq t0, a7, .LBB22_2
+; RV32I-NEXT:    lw a5, 4(a1)
+; RV32I-NEXT:    lw a7, 8(a1)
+; RV32I-NEXT:    lw t1, 12(a1)
+; RV32I-NEXT:    sltu a1, a7, a6
+; RV32I-NEXT:    mv t4, a1
+; RV32I-NEXT:    beq t1, t0, .LBB22_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slt t4, t0, a7
+; RV32I-NEXT:    slt t4, t1, t0
 ; RV32I-NEXT:  .LBB22_2:
 ; RV32I-NEXT:    sltu t2, a2, a3
 ; RV32I-NEXT:    mv t3, t2
-; RV32I-NEXT:    beq a1, a4, .LBB22_4
+; RV32I-NEXT:    beq a5, a4, .LBB22_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu t3, a1, a4
+; RV32I-NEXT:    sltu t3, a5, a4
 ; RV32I-NEXT:  .LBB22_4:
-; RV32I-NEXT:    xor t5, t0, a7
-; RV32I-NEXT:    xor t6, a6, a5
+; RV32I-NEXT:    xor t5, t1, t0
+; RV32I-NEXT:    xor t6, a7, a6
 ; RV32I-NEXT:    or t5, t6, t5
 ; RV32I-NEXT:    mv t6, t3
 ; RV32I-NEXT:    beqz t5, .LBB22_6
@@ -1890,32 +1890,32 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB22_6:
 ; RV32I-NEXT:    sltu t4, a3, a2
 ; RV32I-NEXT:    mv t5, t4
-; RV32I-NEXT:    beq a1, a4, .LBB22_8
+; RV32I-NEXT:    beq a5, a4, .LBB22_8
 ; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    sltu t5, a4, a1
+; RV32I-NEXT:    sltu t5, a4, a5
 ; RV32I-NEXT:  .LBB22_8:
 ; RV32I-NEXT:    bnez t6, .LBB22_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    sltu t1, a5, a6
-; RV32I-NEXT:    sub a7, a7, t0
-; RV32I-NEXT:    sub a5, a5, a6
-; RV32I-NEXT:    sub a4, a4, a1
-; RV32I-NEXT:    sub a6, a7, t1
-; RV32I-NEXT:    sltu a7, a5, t5
-; RV32I-NEXT:    sub a1, a5, t5
+; RV32I-NEXT:    sltu a1, a6, a7
+; RV32I-NEXT:    sub t0, t0, t1
+; RV32I-NEXT:    sub a6, a6, a7
+; RV32I-NEXT:    sub a4, a4, a5
+; RV32I-NEXT:    sub a7, t0, a1
+; RV32I-NEXT:    sltu t0, a6, t5
+; RV32I-NEXT:    sub a1, a6, t5
 ; RV32I-NEXT:    sub a5, a4, t4
-; RV32I-NEXT:    sub a4, a6, a7
+; RV32I-NEXT:    sub a4, a7, t0
 ; RV32I-NEXT:    sub a2, a3, a2
 ; RV32I-NEXT:    j .LBB22_11
 ; RV32I-NEXT:  .LBB22_10:
-; RV32I-NEXT:    sub a7, t0, a7
-; RV32I-NEXT:    sub a5, a6, a5
-; RV32I-NEXT:    sub a4, a1, a4
-; RV32I-NEXT:    sub a6, a7, t1
-; RV32I-NEXT:    sltu a7, a5, t3
-; RV32I-NEXT:    sub a1, a5, t3
-; RV32I-NEXT:    sub a5, a4, t2
-; RV32I-NEXT:    sub a4, a6, a7
+; RV32I-NEXT:    sub t0, t1, t0
+; RV32I-NEXT:    sub a6, a7, a6
+; RV32I-NEXT:    sub a5, a5, a4
+; RV32I-NEXT:    sub a4, t0, a1
+; RV32I-NEXT:    sltu a7, a6, t3
+; RV32I-NEXT:    sub a1, a6, t3
+; RV32I-NEXT:    sub a5, a5, t2
+; RV32I-NEXT:    sub a4, a4, a7
 ; RV32I-NEXT:    sub a2, a2, a3
 ; RV32I-NEXT:  .LBB22_11:
 ; RV32I-NEXT:    sw a2, 0(a0)
@@ -1949,26 +1949,26 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a3, 0(a2)
 ; RV32ZBB-NEXT:    lw a4, 4(a2)
-; RV32ZBB-NEXT:    lw a5, 8(a2)
-; RV32ZBB-NEXT:    lw a7, 12(a2)
-; RV32ZBB-NEXT:    lw a6, 8(a1)
-; RV32ZBB-NEXT:    lw t0, 12(a1)
+; RV32ZBB-NEXT:    lw a6, 8(a2)
+; RV32ZBB-NEXT:    lw t0, 12(a2)
 ; RV32ZBB-NEXT:    lw a2, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 4(a1)
-; RV32ZBB-NEXT:    sltu t1, a6, a5
-; RV32ZBB-NEXT:    mv t4, t1
-; RV32ZBB-NEXT:    beq t0, a7, .LBB22_2
+; RV32ZBB-NEXT:    lw a5, 4(a1)
+; RV32ZBB-NEXT:    lw a7, 8(a1)
+; RV32ZBB-NEXT:    lw t1, 12(a1)
+; RV32ZBB-NEXT:    sltu a1, a7, a6
+; RV32ZBB-NEXT:    mv t4, a1
+; RV32ZBB-NEXT:    beq t1, t0, .LBB22_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    slt t4, t0, a7
+; RV32ZBB-NEXT:    slt t4, t1, t0
 ; RV32ZBB-NEXT:  .LBB22_2:
 ; RV32ZBB-NEXT:    sltu t2, a2, a3
 ; RV32ZBB-NEXT:    mv t3, t2
-; RV32ZBB-NEXT:    beq a1, a4, .LBB22_4
+; RV32ZBB-NEXT:    beq a5, a4, .LBB22_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu t3, a1, a4
+; RV32ZBB-NEXT:    sltu t3, a5, a4
 ; RV32ZBB-NEXT:  .LBB22_4:
-; RV32ZBB-NEXT:    xor t5, t0, a7
-; RV32ZBB-NEXT:    xor t6, a6, a5
+; RV32ZBB-NEXT:    xor t5, t1, t0
+; RV32ZBB-NEXT:    xor t6, a7, a6
 ; RV32ZBB-NEXT:    or t5, t6, t5
 ; RV32ZBB-NEXT:    mv t6, t3
 ; RV32ZBB-NEXT:    beqz t5, .LBB22_6
@@ -1977,32 +1977,32 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB22_6:
 ; RV32ZBB-NEXT:    sltu t4, a3, a2
 ; RV32ZBB-NEXT:    mv t5, t4
-; RV32ZBB-NEXT:    beq a1, a4, .LBB22_8
+; RV32ZBB-NEXT:    beq a5, a4, .LBB22_8
 ; RV32ZBB-NEXT:  # %bb.7:
-; RV32ZBB-NEXT:    sltu t5, a4, a1
+; RV32ZBB-NEXT:    sltu t5, a4, a5
 ; RV32ZBB-NEXT:  .LBB22_8:
 ; RV32ZBB-NEXT:    bnez t6, .LBB22_10
 ; RV32ZBB-NEXT:  # %bb.9:
-; RV32ZBB-NEXT:    sltu t1, a5, a6
-; RV32ZBB-NEXT:    sub a7, a7, t0
-; RV32ZBB-NEXT:    sub a5, a5, a6
-; RV32ZBB-NEXT:    sub a4, a4, a1
-; RV32ZBB-NEXT:    sub a6, a7, t1
-; RV32ZBB-NEXT:    sltu a7, a5, t5
-; RV32ZBB-NEXT:    sub a1, a5, t5
+; RV32ZBB-NEXT:    sltu a1, a6, a7
+; RV32ZBB-NEXT:    sub t0, t0, t1
+; RV32ZBB-NEXT:    sub a6, a6, a7
+; RV32ZBB-NEXT:    sub a4, a4, a5
+; RV32ZBB-NEXT:    sub a7, t0, a1
+; RV32ZBB-NEXT:    sltu t0, a6, t5
+; RV32ZBB-NEXT:    sub a1, a6, t5
 ; RV32ZBB-NEXT:    sub a5, a4, t4
-; RV32ZBB-NEXT:    sub a4, a6, a7
+; RV32ZBB-NEXT:    sub a4, a7, t0
 ; RV32ZBB-NEXT:    sub a2, a3, a2
 ; RV32ZBB-NEXT:    j .LBB22_11
 ; RV32ZBB-NEXT:  .LBB22_10:
-; RV32ZBB-NEXT:    sub a7, t0, a7
-; RV32ZBB-NEXT:    sub a5, a6, a5
-; RV32ZBB-NEXT:    sub a4, a1, a4
-; RV32ZBB-NEXT:    sub a6, a7, t1
-; RV32ZBB-NEXT:    sltu a7, a5, t3
-; RV32ZBB-NEXT:    sub a1, a5, t3
-; RV32ZBB-NEXT:    sub a5, a4, t2
-; RV32ZBB-NEXT:    sub a4, a6, a7
+; RV32ZBB-NEXT:    sub t0, t1, t0
+; RV32ZBB-NEXT:    sub a6, a7, a6
+; RV32ZBB-NEXT:    sub a5, a5, a4
+; RV32ZBB-NEXT:    sub a4, t0, a1
+; RV32ZBB-NEXT:    sltu a7, a6, t3
+; RV32ZBB-NEXT:    sub a1, a6, t3
+; RV32ZBB-NEXT:    sub a5, a5, t2
+; RV32ZBB-NEXT:    sub a4, a4, a7
 ; RV32ZBB-NEXT:    sub a2, a2, a3
 ; RV32ZBB-NEXT:  .LBB22_11:
 ; RV32ZBB-NEXT:    sw a2, 0(a0)
@@ -2391,53 +2391,53 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a3, 0(a2)
 ; RV32I-NEXT:    lw a4, 4(a2)
-; RV32I-NEXT:    lw a5, 8(a2)
-; RV32I-NEXT:    lw a6, 12(a2)
-; RV32I-NEXT:    lw t0, 8(a1)
-; RV32I-NEXT:    lw t1, 12(a1)
+; RV32I-NEXT:    lw a6, 8(a2)
+; RV32I-NEXT:    lw a7, 12(a2)
 ; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a7, 4(a1)
-; RV32I-NEXT:    sltu a1, t0, a5
-; RV32I-NEXT:    sub t1, t1, a6
-; RV32I-NEXT:    sltu a6, a2, a3
-; RV32I-NEXT:    sub a1, t1, a1
-; RV32I-NEXT:    mv t1, a6
-; RV32I-NEXT:    beq a7, a4, .LBB31_2
+; RV32I-NEXT:    lw a5, 4(a1)
+; RV32I-NEXT:    lw t1, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sltu t0, t1, a6
+; RV32I-NEXT:    sub a1, a1, a7
+; RV32I-NEXT:    sub a1, a1, t0
+; RV32I-NEXT:    sltu a7, a2, a3
+; RV32I-NEXT:    mv t0, a7
+; RV32I-NEXT:    beq a5, a4, .LBB31_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t1, a7, a4
+; RV32I-NEXT:    sltu t0, a5, a4
 ; RV32I-NEXT:  .LBB31_2:
-; RV32I-NEXT:    sub a5, t0, a5
-; RV32I-NEXT:    sub a4, a7, a4
-; RV32I-NEXT:    sub a3, a2, a3
-; RV32I-NEXT:    sltu a2, a5, t1
-; RV32I-NEXT:    sub t0, a4, a6
-; RV32I-NEXT:    sub a4, a5, t1
-; RV32I-NEXT:    sub a5, a1, a2
+; RV32I-NEXT:    sub a6, t1, a6
+; RV32I-NEXT:    sub a5, a5, a4
+; RV32I-NEXT:    sub a2, a2, a3
+; RV32I-NEXT:    sltu a3, a6, t0
+; RV32I-NEXT:    sub t1, a5, a7
+; RV32I-NEXT:    sub a4, a6, t0
+; RV32I-NEXT:    sub a5, a1, a3
 ; RV32I-NEXT:    srai a1, a5, 31
-; RV32I-NEXT:    xor a2, a4, a1
-; RV32I-NEXT:    xor a5, a5, a1
-; RV32I-NEXT:    xor a4, a3, a1
-; RV32I-NEXT:    sltu a3, a1, a2
-; RV32I-NEXT:    sub a6, a1, a5
-; RV32I-NEXT:    sltu a5, a1, a4
-; RV32I-NEXT:    sub a3, a6, a3
-; RV32I-NEXT:    xor a7, t0, a1
-; RV32I-NEXT:    mv a6, a5
-; RV32I-NEXT:    beqz t0, .LBB31_4
+; RV32I-NEXT:    xor a3, a4, a1
+; RV32I-NEXT:    xor a6, a5, a1
+; RV32I-NEXT:    xor a5, t1, a1
+; RV32I-NEXT:    xor a4, a2, a1
+; RV32I-NEXT:    sltu a2, a1, a3
+; RV32I-NEXT:    sub a6, a1, a6
+; RV32I-NEXT:    sub a2, a6, a2
+; RV32I-NEXT:    sltu a6, a1, a4
+; RV32I-NEXT:    mv a7, a6
+; RV32I-NEXT:    beqz t1, .LBB31_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu a6, a1, a7
+; RV32I-NEXT:    sltu a7, a1, a5
 ; RV32I-NEXT:  .LBB31_4:
-; RV32I-NEXT:    sub a2, a1, a2
-; RV32I-NEXT:    sub a7, a1, a7
+; RV32I-NEXT:    sub a3, a1, a3
+; RV32I-NEXT:    sub a5, a1, a5
 ; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sltu a4, a2, a6
-; RV32I-NEXT:    sub a2, a2, a6
-; RV32I-NEXT:    sub a5, a7, a5
-; RV32I-NEXT:    sub a3, a3, a4
+; RV32I-NEXT:    sltu a4, a3, a7
+; RV32I-NEXT:    sub a3, a3, a7
+; RV32I-NEXT:    sub a5, a5, a6
+; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a3, 8(a0)
+; RV32I-NEXT:    sw a2, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_subnsw_i128:
@@ -2459,53 +2459,53 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a3, 0(a2)
 ; RV32ZBB-NEXT:    lw a4, 4(a2)
-; RV32ZBB-NEXT:    lw a5, 8(a2)
-; RV32ZBB-NEXT:    lw a6, 12(a2)
-; RV32ZBB-NEXT:    lw t0, 8(a1)
-; RV32ZBB-NEXT:    lw t1, 12(a1)
+; RV32ZBB-NEXT:    lw a6, 8(a2)
+; RV32ZBB-NEXT:    lw a7, 12(a2)
 ; RV32ZBB-NEXT:    lw a2, 0(a1)
-; RV32ZBB-NEXT:    lw a7, 4(a1)
-; RV32ZBB-NEXT:    sltu a1, t0, a5
-; RV32ZBB-NEXT:    sub t1, t1, a6
-; RV32ZBB-NEXT:    sltu a6, a2, a3
-; RV32ZBB-NEXT:    sub a1, t1, a1
-; RV32ZBB-NEXT:    mv t1, a6
-; RV32ZBB-NEXT:    beq a7, a4, .LBB31_2
+; RV32ZBB-NEXT:    lw a5, 4(a1)
+; RV32ZBB-NEXT:    lw t1, 8(a1)
+; RV32ZBB-NEXT:    lw a1, 12(a1)
+; RV32ZBB-NEXT:    sltu t0, t1, a6
+; RV32ZBB-NEXT:    sub a1, a1, a7
+; RV32ZBB-NEXT:    sub a1, a1, t0
+; RV32ZBB-NEXT:    sltu a7, a2, a3
+; RV32ZBB-NEXT:    mv t0, a7
+; RV32ZBB-NEXT:    beq a5, a4, .LBB31_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t1, a7, a4
+; RV32ZBB-NEXT:    sltu t0, a5, a4
 ; RV32ZBB-NEXT:  .LBB31_2:
-; RV32ZBB-NEXT:    sub a5, t0, a5
-; RV32ZBB-NEXT:    sub a4, a7, a4
-; RV32ZBB-NEXT:    sub a3, a2, a3
-; RV32ZBB-NEXT:    sltu a2, a5, t1
-; RV32ZBB-NEXT:    sub t0, a4, a6
-; RV32ZBB-NEXT:    sub a4, a5, t1
-; RV32ZBB-NEXT:    sub a5, a1, a2
+; RV32ZBB-NEXT:    sub a6, t1, a6
+; RV32ZBB-NEXT:    sub a5, a5, a4
+; RV32ZBB-NEXT:    sub a2, a2, a3
+; RV32ZBB-NEXT:    sltu a3, a6, t0
+; RV32ZBB-NEXT:    sub t1, a5, a7
+; RV32ZBB-NEXT:    sub a4, a6, t0
+; RV32ZBB-NEXT:    sub a5, a1, a3
 ; RV32ZBB-NEXT:    srai a1, a5, 31
-; RV32ZBB-NEXT:    xor a2, a4, a1
-; RV32ZBB-NEXT:    xor a5, a5, a1
-; RV32ZBB-NEXT:    xor a4, a3, a1
-; RV32ZBB-NEXT:    sltu a3, a1, a2
-; RV32ZBB-NEXT:    sub a6, a1, a5
-; RV32ZBB-NEXT:    sltu a5, a1, a4
-; RV32ZBB-NEXT:    sub a3, a6, a3
-; RV32ZBB-NEXT:    xor a7, t0, a1
-; RV32ZBB-NEXT:    mv a6, a5
-; RV32ZBB-NEXT:    beqz t0, .LBB31_4
+; RV32ZBB-NEXT:    xor a3, a4, a1
+; RV32ZBB-NEXT:    xor a6, a5, a1
+; RV32ZBB-NEXT:    xor a5, t1, a1
+; RV32ZBB-NEXT:    xor a4, a2, a1
+; RV32ZBB-NEXT:    sltu a2, a1, a3
+; RV32ZBB-NEXT:    sub a6, a1, a6
+; RV32ZBB-NEXT:    sub a2, a6, a2
+; RV32ZBB-NEXT:    sltu a6, a1, a4
+; RV32ZBB-NEXT:    mv a7, a6
+; RV32ZBB-NEXT:    beqz t1, .LBB31_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu a6, a1, a7
+; RV32ZBB-NEXT:    sltu a7, a1, a5
 ; RV32ZBB-NEXT:  .LBB31_4:
-; RV32ZBB-NEXT:    sub a2, a1, a2
-; RV32ZBB-NEXT:    sub a7, a1, a7
+; RV32ZBB-NEXT:    sub a3, a1, a3
+; RV32ZBB-NEXT:    sub a5, a1, a5
 ; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sltu a4, a2, a6
-; RV32ZBB-NEXT:    sub a2, a2, a6
-; RV32ZBB-NEXT:    sub a5, a7, a5
-; RV32ZBB-NEXT:    sub a3, a3, a4
+; RV32ZBB-NEXT:    sltu a4, a3, a7
+; RV32ZBB-NEXT:    sub a3, a3, a7
+; RV32ZBB-NEXT:    sub a5, a5, a6
+; RV32ZBB-NEXT:    sub a2, a2, a4
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a3, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 8(a0)
+; RV32ZBB-NEXT:    sw a2, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_subnsw_i128:
@@ -2533,53 +2533,53 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a3, 0(a2)
 ; RV32I-NEXT:    lw a4, 4(a2)
-; RV32I-NEXT:    lw a5, 8(a2)
-; RV32I-NEXT:    lw a6, 12(a2)
-; RV32I-NEXT:    lw t0, 8(a1)
-; RV32I-NEXT:    lw t1, 12(a1)
+; RV32I-NEXT:    lw a6, 8(a2)
+; RV32I-NEXT:    lw a7, 12(a2)
 ; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a7, 4(a1)
-; RV32I-NEXT:    sltu a1, t0, a5
-; RV32I-NEXT:    sub t1, t1, a6
-; RV32I-NEXT:    sltu a6, a2, a3
-; RV32I-NEXT:    sub a1, t1, a1
-; RV32I-NEXT:    mv t1, a6
-; RV32I-NEXT:    beq a7, a4, .LBB32_2
+; RV32I-NEXT:    lw a5, 4(a1)
+; RV32I-NEXT:    lw t1, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sltu t0, t1, a6
+; RV32I-NEXT:    sub a1, a1, a7
+; RV32I-NEXT:    sub a1, a1, t0
+; RV32I-NEXT:    sltu a7, a2, a3
+; RV32I-NEXT:    mv t0, a7
+; RV32I-NEXT:    beq a5, a4, .LBB32_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t1, a7, a4
+; RV32I-NEXT:    sltu t0, a5, a4
 ; RV32I-NEXT:  .LBB32_2:
-; RV32I-NEXT:    sub a5, t0, a5
-; RV32I-NEXT:    sub a4, a7, a4
-; RV32I-NEXT:    sub a3, a2, a3
-; RV32I-NEXT:    sltu a2, a5, t1
-; RV32I-NEXT:    sub t0, a4, a6
-; RV32I-NEXT:    sub a4, a5, t1
-; RV32I-NEXT:    sub a5, a1, a2
+; RV32I-NEXT:    sub a6, t1, a6
+; RV32I-NEXT:    sub a5, a5, a4
+; RV32I-NEXT:    sub a2, a2, a3
+; RV32I-NEXT:    sltu a3, a6, t0
+; RV32I-NEXT:    sub t1, a5, a7
+; RV32I-NEXT:    sub a4, a6, t0
+; RV32I-NEXT:    sub a5, a1, a3
 ; RV32I-NEXT:    srai a1, a5, 31
-; RV32I-NEXT:    xor a2, a4, a1
-; RV32I-NEXT:    xor a5, a5, a1
-; RV32I-NEXT:    xor a4, a3, a1
-; RV32I-NEXT:    sltu a3, a1, a2
-; RV32I-NEXT:    sub a6, a1, a5
-; RV32I-NEXT:    sltu a5, a1, a4
-; RV32I-NEXT:    sub a3, a6, a3
-; RV32I-NEXT:    xor a7, t0, a1
-; RV32I-NEXT:    mv a6, a5
-; RV32I-NEXT:    beqz t0, .LBB32_4
+; RV32I-NEXT:    xor a3, a4, a1
+; RV32I-NEXT:    xor a6, a5, a1
+; RV32I-NEXT:    xor a5, t1, a1
+; RV32I-NEXT:    xor a4, a2, a1
+; RV32I-NEXT:    sltu a2, a1, a3
+; RV32I-NEXT:    sub a6, a1, a6
+; RV32I-NEXT:    sub a2, a6, a2
+; RV32I-NEXT:    sltu a6, a1, a4
+; RV32I-NEXT:    mv a7, a6
+; RV32I-NEXT:    beqz t1, .LBB32_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu a6, a1, a7
+; RV32I-NEXT:    sltu a7, a1, a5
 ; RV32I-NEXT:  .LBB32_4:
-; RV32I-NEXT:    sub a2, a1, a2
-; RV32I-NEXT:    sub a7, a1, a7
+; RV32I-NEXT:    sub a3, a1, a3
+; RV32I-NEXT:    sub a5, a1, a5
 ; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sltu a4, a2, a6
-; RV32I-NEXT:    sub a2, a2, a6
-; RV32I-NEXT:    sub a5, a7, a5
-; RV32I-NEXT:    sub a3, a3, a4
+; RV32I-NEXT:    sltu a4, a3, a7
+; RV32I-NEXT:    sub a3, a3, a7
+; RV32I-NEXT:    sub a5, a5, a6
+; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a3, 8(a0)
+; RV32I-NEXT:    sw a2, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_subnsw_i128_undef:
@@ -2601,53 +2601,53 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a3, 0(a2)
 ; RV32ZBB-NEXT:    lw a4, 4(a2)
-; RV32ZBB-NEXT:    lw a5, 8(a2)
-; RV32ZBB-NEXT:    lw a6, 12(a2)
-; RV32ZBB-NEXT:    lw t0, 8(a1)
-; RV32ZBB-NEXT:    lw t1, 12(a1)
+; RV32ZBB-NEXT:    lw a6, 8(a2)
+; RV32ZBB-NEXT:    lw a7, 12(a2)
 ; RV32ZBB-NEXT:    lw a2, 0(a1)
-; RV32ZBB-NEXT:    lw a7, 4(a1)
-; RV32ZBB-NEXT:    sltu a1, t0, a5
-; RV32ZBB-NEXT:    sub t1, t1, a6
-; RV32ZBB-NEXT:    sltu a6, a2, a3
-; RV32ZBB-NEXT:    sub a1, t1, a1
-; RV32ZBB-NEXT:    mv t1, a6
-; RV32ZBB-NEXT:    beq a7, a4, .LBB32_2
+; RV32ZBB-NEXT:    lw a5, 4(a1)
+; RV32ZBB-NEXT:    lw t1, 8(a1)
+; RV32ZBB-NEXT:    lw a1, 12(a1)
+; RV32ZBB-NEXT:    sltu t0, t1, a6
+; RV32ZBB-NEXT:    sub a1, a1, a7
+; RV32ZBB-NEXT:    sub a1, a1, t0
+; RV32ZBB-NEXT:    sltu a7, a2, a3
+; RV32ZBB-NEXT:    mv t0, a7
+; RV32ZBB-NEXT:    beq a5, a4, .LBB32_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t1, a7, a4
+; RV32ZBB-NEXT:    sltu t0, a5, a4
 ; RV32ZBB-NEXT:  .LBB32_2:
-; RV32ZBB-NEXT:    sub a5, t0, a5
-; RV32ZBB-NEXT:    sub a4, a7, a4
-; RV32ZBB-NEXT:    sub a3, a2, a3
-; RV32ZBB-NEXT:    sltu a2, a5, t1
-; RV32ZBB-NEXT:    sub t0, a4, a6
-; RV32ZBB-NEXT:    sub a4, a5, t1
-; RV32ZBB-NEXT:    sub a5, a1, a2
+; RV32ZBB-NEXT:    sub a6, t1, a6
+; RV32ZBB-NEXT:    sub a5, a5, a4
+; RV32ZBB-NEXT:    sub a2, a2, a3
+; RV32ZBB-NEXT:    sltu a3, a6, t0
+; RV32ZBB-NEXT:    sub t1, a5, a7
+; RV32ZBB-NEXT:    sub a4, a6, t0
+; RV32ZBB-NEXT:    sub a5, a1, a3
 ; RV32ZBB-NEXT:    srai a1, a5, 31
-; RV32ZBB-NEXT:    xor a2, a4, a1
-; RV32ZBB-NEXT:    xor a5, a5, a1
-; RV32ZBB-NEXT:    xor a4, a3, a1
-; RV32ZBB-NEXT:    sltu a3, a1, a2
-; RV32ZBB-NEXT:    sub a6, a1, a5
-; RV32ZBB-NEXT:    sltu a5, a1, a4
-; RV32ZBB-NEXT:    sub a3, a6, a3
-; RV32ZBB-NEXT:    xor a7, t0, a1
-; RV32ZBB-NEXT:    mv a6, a5
-; RV32ZBB-NEXT:    beqz t0, .LBB32_4
+; RV32ZBB-NEXT:    xor a3, a4, a1
+; RV32ZBB-NEXT:    xor a6, a5, a1
+; RV32ZBB-NEXT:    xor a5, t1, a1
+; RV32ZBB-NEXT:    xor a4, a2, a1
+; RV32ZBB-NEXT:    sltu a2, a1, a3
+; RV32ZBB-NEXT:    sub a6, a1, a6
+; RV32ZBB-NEXT:    sub a2, a6, a2
+; RV32ZBB-NEXT:    sltu a6, a1, a4
+; RV32ZBB-NEXT:    mv a7, a6
+; RV32ZBB-NEXT:    beqz t1, .LBB32_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu a6, a1, a7
+; RV32ZBB-NEXT:    sltu a7, a1, a5
 ; RV32ZBB-NEXT:  .LBB32_4:
-; RV32ZBB-NEXT:    sub a2, a1, a2
-; RV32ZBB-NEXT:    sub a7, a1, a7
+; RV32ZBB-NEXT:    sub a3, a1, a3
+; RV32ZBB-NEXT:    sub a5, a1, a5
 ; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sltu a4, a2, a6
-; RV32ZBB-NEXT:    sub a2, a2, a6
-; RV32ZBB-NEXT:    sub a5, a7, a5
-; RV32ZBB-NEXT:    sub a3, a3, a4
+; RV32ZBB-NEXT:    sltu a4, a3, a7
+; RV32ZBB-NEXT:    sub a3, a3, a7
+; RV32ZBB-NEXT:    sub a5, a5, a6
+; RV32ZBB-NEXT:    sub a2, a2, a4
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a3, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 8(a0)
+; RV32ZBB-NEXT:    sw a2, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_subnsw_i128_undef:
diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll
index 56e6dacff9748..9e866220af666 100644
--- a/llvm/test/CodeGen/RISCV/abds.ll
+++ b/llvm/test/CodeGen/RISCV/abds.ll
@@ -538,18 +538,18 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a6, 8(a1)
 ; RV32I-NEXT:    lw t0, 12(a1)
-; RV32I-NEXT:    lw a7, 8(a2)
-; RV32I-NEXT:    lw t1, 12(a2)
 ; RV32I-NEXT:    lw a5, 0(a2)
 ; RV32I-NEXT:    lw a1, 4(a2)
+; RV32I-NEXT:    lw a7, 8(a2)
+; RV32I-NEXT:    lw t1, 12(a2)
 ; RV32I-NEXT:    sltu a2, a7, a6
 ; RV32I-NEXT:    mv t4, a2
 ; RV32I-NEXT:    beq t0, t1, .LBB11_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    slt t4, t1, t0
 ; RV32I-NEXT:  .LBB11_2:
-; RV32I-NEXT:    sltu t2, a5, a3
 ; RV32I-NEXT:    sltu t5, a1, a4
+; RV32I-NEXT:    sltu t2, a5, a3
 ; RV32I-NEXT:    mv t3, t2
 ; RV32I-NEXT:    beq a4, a1, .LBB11_4
 ; RV32I-NEXT:  # %bb.3:
@@ -634,18 +634,18 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw a4, 4(a1)
 ; RV32ZBB-NEXT:    lw a6, 8(a1)
 ; RV32ZBB-NEXT:    lw t0, 12(a1)
-; RV32ZBB-NEXT:    lw a7, 8(a2)
-; RV32ZBB-NEXT:    lw t1, 12(a2)
 ; RV32ZBB-NEXT:    lw a5, 0(a2)
 ; RV32ZBB-NEXT:    lw a1, 4(a2)
+; RV32ZBB-NEXT:    lw a7, 8(a2)
+; RV32ZBB-NEXT:    lw t1, 12(a2)
 ; RV32ZBB-NEXT:    sltu a2, a7, a6
 ; RV32ZBB-NEXT:    mv t4, a2
 ; RV32ZBB-NEXT:    beq t0, t1, .LBB11_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    slt t4, t1, t0
 ; RV32ZBB-NEXT:  .LBB11_2:
-; RV32ZBB-NEXT:    sltu t2, a5, a3
 ; RV32ZBB-NEXT:    sltu t5, a1, a4
+; RV32ZBB-NEXT:    sltu t2, a5, a3
 ; RV32ZBB-NEXT:    mv t3, t2
 ; RV32ZBB-NEXT:    beq a4, a1, .LBB11_4
 ; RV32ZBB-NEXT:  # %bb.3:
@@ -738,18 +738,18 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a6, 8(a1)
 ; RV32I-NEXT:    lw t0, 12(a1)
-; RV32I-NEXT:    lw a7, 8(a2)
-; RV32I-NEXT:    lw t1, 12(a2)
 ; RV32I-NEXT:    lw a5, 0(a2)
 ; RV32I-NEXT:    lw a1, 4(a2)
+; RV32I-NEXT:    lw a7, 8(a2)
+; RV32I-NEXT:    lw t1, 12(a2)
 ; RV32I-NEXT:    sltu a2, a7, a6
 ; RV32I-NEXT:    mv t4, a2
 ; RV32I-NEXT:    beq t0, t1, .LBB12_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    slt t4, t1, t0
 ; RV32I-NEXT:  .LBB12_2:
-; RV32I-NEXT:    sltu t2, a5, a3
 ; RV32I-NEXT:    sltu t5, a1, a4
+; RV32I-NEXT:    sltu t2, a5, a3
 ; RV32I-NEXT:    mv t3, t2
 ; RV32I-NEXT:    beq a4, a1, .LBB12_4
 ; RV32I-NEXT:  # %bb.3:
@@ -834,18 +834,18 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw a4, 4(a1)
 ; RV32ZBB-NEXT:    lw a6, 8(a1)
 ; RV32ZBB-NEXT:    lw t0, 12(a1)
-; RV32ZBB-NEXT:    lw a7, 8(a2)
-; RV32ZBB-NEXT:    lw t1, 12(a2)
 ; RV32ZBB-NEXT:    lw a5, 0(a2)
 ; RV32ZBB-NEXT:    lw a1, 4(a2)
+; RV32ZBB-NEXT:    lw a7, 8(a2)
+; RV32ZBB-NEXT:    lw t1, 12(a2)
 ; RV32ZBB-NEXT:    sltu a2, a7, a6
 ; RV32ZBB-NEXT:    mv t4, a2
 ; RV32ZBB-NEXT:    beq t0, t1, .LBB12_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    slt t4, t1, t0
 ; RV32ZBB-NEXT:  .LBB12_2:
-; RV32ZBB-NEXT:    sltu t2, a5, a3
 ; RV32ZBB-NEXT:    sltu t5, a1, a4
+; RV32ZBB-NEXT:    sltu t2, a5, a3
 ; RV32ZBB-NEXT:    mv t3, t2
 ; RV32ZBB-NEXT:    beq a4, a1, .LBB12_4
 ; RV32ZBB-NEXT:  # %bb.3:
@@ -1127,18 +1127,18 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a6, 8(a1)
 ; RV32I-NEXT:    lw t0, 12(a1)
-; RV32I-NEXT:    lw a7, 8(a2)
-; RV32I-NEXT:    lw t1, 12(a2)
 ; RV32I-NEXT:    lw a5, 0(a2)
 ; RV32I-NEXT:    lw a1, 4(a2)
+; RV32I-NEXT:    lw a7, 8(a2)
+; RV32I-NEXT:    lw t1, 12(a2)
 ; RV32I-NEXT:    sltu a2, a7, a6
 ; RV32I-NEXT:    mv t4, a2
 ; RV32I-NEXT:    beq t0, t1, .LBB17_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    slt t4, t1, t0
 ; RV32I-NEXT:  .LBB17_2:
-; RV32I-NEXT:    sltu t2, a5, a3
 ; RV32I-NEXT:    sltu t5, a1, a4
+; RV32I-NEXT:    sltu t2, a5, a3
 ; RV32I-NEXT:    mv t3, t2
 ; RV32I-NEXT:    beq a4, a1, .LBB17_4
 ; RV32I-NEXT:  # %bb.3:
@@ -1223,18 +1223,18 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw a4, 4(a1)
 ; RV32ZBB-NEXT:    lw a6, 8(a1)
 ; RV32ZBB-NEXT:    lw t0, 12(a1)
-; RV32ZBB-NEXT:    lw a7, 8(a2)
-; RV32ZBB-NEXT:    lw t1, 12(a2)
 ; RV32ZBB-NEXT:    lw a5, 0(a2)
 ; RV32ZBB-NEXT:    lw a1, 4(a2)
+; RV32ZBB-NEXT:    lw a7, 8(a2)
+; RV32ZBB-NEXT:    lw t1, 12(a2)
 ; RV32ZBB-NEXT:    sltu a2, a7, a6
 ; RV32ZBB-NEXT:    mv t4, a2
 ; RV32ZBB-NEXT:    beq t0, t1, .LBB17_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    slt t4, t1, t0
 ; RV32ZBB-NEXT:  .LBB17_2:
-; RV32ZBB-NEXT:    sltu t2, a5, a3
 ; RV32ZBB-NEXT:    sltu t5, a1, a4
+; RV32ZBB-NEXT:    sltu t2, a5, a3
 ; RV32ZBB-NEXT:    mv t3, t2
 ; RV32ZBB-NEXT:    beq a4, a1, .LBB17_4
 ; RV32ZBB-NEXT:  # %bb.3:
@@ -1518,18 +1518,18 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a6, 8(a1)
 ; RV32I-NEXT:    lw t0, 12(a1)
-; RV32I-NEXT:    lw a7, 8(a2)
-; RV32I-NEXT:    lw t1, 12(a2)
 ; RV32I-NEXT:    lw a5, 0(a2)
 ; RV32I-NEXT:    lw a1, 4(a2)
+; RV32I-NEXT:    lw a7, 8(a2)
+; RV32I-NEXT:    lw t1, 12(a2)
 ; RV32I-NEXT:    sltu a2, a7, a6
 ; RV32I-NEXT:    mv t4, a2
 ; RV32I-NEXT:    beq t0, t1, .LBB22_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    slt t4, t1, t0
 ; RV32I-NEXT:  .LBB22_2:
-; RV32I-NEXT:    sltu t2, a5, a3
 ; RV32I-NEXT:    sltu t5, a1, a4
+; RV32I-NEXT:    sltu t2, a5, a3
 ; RV32I-NEXT:    mv t3, t2
 ; RV32I-NEXT:    beq a4, a1, .LBB22_4
 ; RV32I-NEXT:  # %bb.3:
@@ -1614,18 +1614,18 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw a4, 4(a1)
 ; RV32ZBB-NEXT:    lw a6, 8(a1)
 ; RV32ZBB-NEXT:    lw t0, 12(a1)
-; RV32ZBB-NEXT:    lw a7, 8(a2)
-; RV32ZBB-NEXT:    lw t1, 12(a2)
 ; RV32ZBB-NEXT:    lw a5, 0(a2)
 ; RV32ZBB-NEXT:    lw a1, 4(a2)
+; RV32ZBB-NEXT:    lw a7, 8(a2)
+; RV32ZBB-NEXT:    lw t1, 12(a2)
 ; RV32ZBB-NEXT:    sltu a2, a7, a6
 ; RV32ZBB-NEXT:    mv t4, a2
 ; RV32ZBB-NEXT:    beq t0, t1, .LBB22_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    slt t4, t1, t0
 ; RV32ZBB-NEXT:  .LBB22_2:
-; RV32ZBB-NEXT:    sltu t2, a5, a3
 ; RV32ZBB-NEXT:    sltu t5, a1, a4
+; RV32ZBB-NEXT:    sltu t2, a5, a3
 ; RV32ZBB-NEXT:    mv t3, t2
 ; RV32ZBB-NEXT:    beq a4, a1, .LBB22_4
 ; RV32ZBB-NEXT:  # %bb.3:
@@ -2045,27 +2045,27 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a4, 0(a2)
 ; RV32I-NEXT:    lw a3, 4(a2)
-; RV32I-NEXT:    lw a5, 8(a2)
-; RV32I-NEXT:    lw a6, 12(a2)
-; RV32I-NEXT:    lw a7, 8(a1)
-; RV32I-NEXT:    lw t0, 12(a1)
+; RV32I-NEXT:    lw a6, 8(a2)
+; RV32I-NEXT:    lw a7, 12(a2)
 ; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
-; RV32I-NEXT:    sltu t1, a7, a5
-; RV32I-NEXT:    sub t0, t0, a6
-; RV32I-NEXT:    sltu a6, a2, a4
-; RV32I-NEXT:    sub t0, t0, t1
-; RV32I-NEXT:    mv t1, a6
-; RV32I-NEXT:    beq a1, a3, .LBB31_2
+; RV32I-NEXT:    lw a5, 4(a1)
+; RV32I-NEXT:    lw t0, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sltu t1, t0, a6
+; RV32I-NEXT:    sub a1, a1, a7
+; RV32I-NEXT:    sub a1, a1, t1
+; RV32I-NEXT:    sltu a7, a2, a4
+; RV32I-NEXT:    mv t1, a7
+; RV32I-NEXT:    beq a5, a3, .LBB31_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t1, a1, a3
+; RV32I-NEXT:    sltu t1, a5, a3
 ; RV32I-NEXT:  .LBB31_2:
-; RV32I-NEXT:    sub a5, a7, a5
-; RV32I-NEXT:    sub a3, a1, a3
-; RV32I-NEXT:    sltu a1, a5, t1
-; RV32I-NEXT:    sub a5, a5, t1
-; RV32I-NEXT:    sub a1, t0, a1
-; RV32I-NEXT:    sub a3, a3, a6
+; RV32I-NEXT:    sub a6, t0, a6
+; RV32I-NEXT:    sub a3, a5, a3
+; RV32I-NEXT:    sltu t0, a6, t1
+; RV32I-NEXT:    sub a5, a6, t1
+; RV32I-NEXT:    sub a1, a1, t0
+; RV32I-NEXT:    sub a3, a3, a7
 ; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    bgez a1, .LBB31_4
 ; RV32I-NEXT:  # %bb.3:
@@ -2108,27 +2108,27 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a4, 0(a2)
 ; RV32ZBB-NEXT:    lw a3, 4(a2)
-; RV32ZBB-NEXT:    lw a5, 8(a2)
-; RV32ZBB-NEXT:    lw a6, 12(a2)
-; RV32ZBB-NEXT:    lw a7, 8(a1)
-; RV32ZBB-NEXT:    lw t0, 12(a1)
+; RV32ZBB-NEXT:    lw a6, 8(a2)
+; RV32ZBB-NEXT:    lw a7, 12(a2)
 ; RV32ZBB-NEXT:    lw a2, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 4(a1)
-; RV32ZBB-NEXT:    sltu t1, a7, a5
-; RV32ZBB-NEXT:    sub t0, t0, a6
-; RV32ZBB-NEXT:    sltu a6, a2, a4
-; RV32ZBB-NEXT:    sub t0, t0, t1
-; RV32ZBB-NEXT:    mv t1, a6
-; RV32ZBB-NEXT:    beq a1, a3, .LBB31_2
+; RV32ZBB-NEXT:    lw a5, 4(a1)
+; RV32ZBB-NEXT:    lw t0, 8(a1)
+; RV32ZBB-NEXT:    lw a1, 12(a1)
+; RV32ZBB-NEXT:    sltu t1, t0, a6
+; RV32ZBB-NEXT:    sub a1, a1, a7
+; RV32ZBB-NEXT:    sub a1, a1, t1
+; RV32ZBB-NEXT:    sltu a7, a2, a4
+; RV32ZBB-NEXT:    mv t1, a7
+; RV32ZBB-NEXT:    beq a5, a3, .LBB31_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t1, a1, a3
+; RV32ZBB-NEXT:    sltu t1, a5, a3
 ; RV32ZBB-NEXT:  .LBB31_2:
-; RV32ZBB-NEXT:    sub a5, a7, a5
-; RV32ZBB-NEXT:    sub a3, a1, a3
-; RV32ZBB-NEXT:    sltu a1, a5, t1
-; RV32ZBB-NEXT:    sub a5, a5, t1
-; RV32ZBB-NEXT:    sub a1, t0, a1
-; RV32ZBB-NEXT:    sub a3, a3, a6
+; RV32ZBB-NEXT:    sub a6, t0, a6
+; RV32ZBB-NEXT:    sub a3, a5, a3
+; RV32ZBB-NEXT:    sltu t0, a6, t1
+; RV32ZBB-NEXT:    sub a5, a6, t1
+; RV32ZBB-NEXT:    sub a1, a1, t0
+; RV32ZBB-NEXT:    sub a3, a3, a7
 ; RV32ZBB-NEXT:    sub a2, a2, a4
 ; RV32ZBB-NEXT:    bgez a1, .LBB31_4
 ; RV32ZBB-NEXT:  # %bb.3:
@@ -2176,27 +2176,27 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a4, 0(a2)
 ; RV32I-NEXT:    lw a3, 4(a2)
-; RV32I-NEXT:    lw a5, 8(a2)
-; RV32I-NEXT:    lw a6, 12(a2)
-; RV32I-NEXT:    lw a7, 8(a1)
-; RV32I-NEXT:    lw t0, 12(a1)
+; RV32I-NEXT:    lw a6, 8(a2)
+; RV32I-NEXT:    lw a7, 12(a2)
 ; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
-; RV32I-NEXT:    sltu t1, a7, a5
-; RV32I-NEXT:    sub t0, t0, a6
-; RV32I-NEXT:    sltu a6, a2, a4
-; RV32I-NEXT:    sub t0, t0, t1
-; RV32I-NEXT:    mv t1, a6
-; RV32I-NEXT:    beq a1, a3, .LBB32_2
+; RV32I-NEXT:    lw a5, 4(a1)
+; RV32I-NEXT:    lw t0, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sltu t1, t0, a6
+; RV32I-NEXT:    sub a1, a1, a7
+; RV32I-NEXT:    sub a1, a1, t1
+; RV32I-NEXT:    sltu a7, a2, a4
+; RV32I-NEXT:    mv t1, a7
+; RV32I-NEXT:    beq a5, a3, .LBB32_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t1, a1, a3
+; RV32I-NEXT:    sltu t1, a5, a3
 ; RV32I-NEXT:  .LBB32_2:
-; RV32I-NEXT:    sub a5, a7, a5
-; RV32I-NEXT:    sub a3, a1, a3
-; RV32I-NEXT:    sltu a1, a5, t1
-; RV32I-NEXT:    sub a5, a5, t1
-; RV32I-NEXT:    sub a1, t0, a1
-; RV32I-NEXT:    sub a3, a3, a6
+; RV32I-NEXT:    sub a6, t0, a6
+; RV32I-NEXT:    sub a3, a5, a3
+; RV32I-NEXT:    sltu t0, a6, t1
+; RV32I-NEXT:    sub a5, a6, t1
+; RV32I-NEXT:    sub a1, a1, t0
+; RV32I-NEXT:    sub a3, a3, a7
 ; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    bgez a1, .LBB32_4
 ; RV32I-NEXT:  # %bb.3:
@@ -2239,27 +2239,27 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a4, 0(a2)
 ; RV32ZBB-NEXT:    lw a3, 4(a2)
-; RV32ZBB-NEXT:    lw a5, 8(a2)
-; RV32ZBB-NEXT:    lw a6, 12(a2)
-; RV32ZBB-NEXT:    lw a7, 8(a1)
-; RV32ZBB-NEXT:    lw t0, 12(a1)
+; RV32ZBB-NEXT:    lw a6, 8(a2)
+; RV32ZBB-NEXT:    lw a7, 12(a2)
 ; RV32ZBB-NEXT:    lw a2, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 4(a1)
-; RV32ZBB-NEXT:    sltu t1, a7, a5
-; RV32ZBB-NEXT:    sub t0, t0, a6
-; RV32ZBB-NEXT:    sltu a6, a2, a4
-; RV32ZBB-NEXT:    sub t0, t0, t1
-; RV32ZBB-NEXT:    mv t1, a6
-; RV32ZBB-NEXT:    beq a1, a3, .LBB32_2
+; RV32ZBB-NEXT:    lw a5, 4(a1)
+; RV32ZBB-NEXT:    lw t0, 8(a1)
+; RV32ZBB-NEXT:    lw a1, 12(a1)
+; RV32ZBB-NEXT:    sltu t1, t0, a6
+; RV32ZBB-NEXT:    sub a1, a1, a7
+; RV32ZBB-NEXT:    sub a1, a1, t1
+; RV32ZBB-NEXT:    sltu a7, a2, a4
+; RV32ZBB-NEXT:    mv t1, a7
+; RV32ZBB-NEXT:    beq a5, a3, .LBB32_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t1, a1, a3
+; RV32ZBB-NEXT:    sltu t1, a5, a3
 ; RV32ZBB-NEXT:  .LBB32_2:
-; RV32ZBB-NEXT:    sub a5, a7, a5
-; RV32ZBB-NEXT:    sub a3, a1, a3
-; RV32ZBB-NEXT:    sltu a1, a5, t1
-; RV32ZBB-NEXT:    sub a5, a5, t1
-; RV32ZBB-NEXT:    sub a1, t0, a1
-; RV32ZBB-NEXT:    sub a3, a3, a6
+; RV32ZBB-NEXT:    sub a6, t0, a6
+; RV32ZBB-NEXT:    sub a3, a5, a3
+; RV32ZBB-NEXT:    sltu t0, a6, t1
+; RV32ZBB-NEXT:    sub a5, a6, t1
+; RV32ZBB-NEXT:    sub a1, a1, t0
+; RV32ZBB-NEXT:    sub a3, a3, a7
 ; RV32ZBB-NEXT:    sub a2, a2, a4
 ; RV32ZBB-NEXT:    bgez a1, .LBB32_4
 ; RV32ZBB-NEXT:  # %bb.3:
@@ -2541,18 +2541,18 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a6, 8(a1)
 ; RV32I-NEXT:    lw t0, 12(a1)
-; RV32I-NEXT:    lw a7, 8(a2)
-; RV32I-NEXT:    lw t1, 12(a2)
 ; RV32I-NEXT:    lw a5, 0(a2)
 ; RV32I-NEXT:    lw a1, 4(a2)
+; RV32I-NEXT:    lw a7, 8(a2)
+; RV32I-NEXT:    lw t1, 12(a2)
 ; RV32I-NEXT:    sltu a2, a7, a6
 ; RV32I-NEXT:    mv t4, a2
 ; RV32I-NEXT:    beq t0, t1, .LBB38_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    slt t4, t1, t0
 ; RV32I-NEXT:  .LBB38_2:
-; RV32I-NEXT:    sltu t2, a5, a3
 ; RV32I-NEXT:    sltu t5, a1, a4
+; RV32I-NEXT:    sltu t2, a5, a3
 ; RV32I-NEXT:    mv t3, t2
 ; RV32I-NEXT:    beq a4, a1, .LBB38_4
 ; RV32I-NEXT:  # %bb.3:
@@ -2637,18 +2637,18 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw a4, 4(a1)
 ; RV32ZBB-NEXT:    lw a6, 8(a1)
 ; RV32ZBB-NEXT:    lw t0, 12(a1)
-; RV32ZBB-NEXT:    lw a7, 8(a2)
-; RV32ZBB-NEXT:    lw t1, 12(a2)
 ; RV32ZBB-NEXT:    lw a5, 0(a2)
 ; RV32ZBB-NEXT:    lw a1, 4(a2)
+; RV32ZBB-NEXT:    lw a7, 8(a2)
+; RV32ZBB-NEXT:    lw t1, 12(a2)
 ; RV32ZBB-NEXT:    sltu a2, a7, a6
 ; RV32ZBB-NEXT:    mv t4, a2
 ; RV32ZBB-NEXT:    beq t0, t1, .LBB38_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    slt t4, t1, t0
 ; RV32ZBB-NEXT:  .LBB38_2:
-; RV32ZBB-NEXT:    sltu t2, a5, a3
 ; RV32ZBB-NEXT:    sltu t5, a1, a4
+; RV32ZBB-NEXT:    sltu t2, a5, a3
 ; RV32ZBB-NEXT:    mv t3, t2
 ; RV32ZBB-NEXT:    beq a4, a1, .LBB38_4
 ; RV32ZBB-NEXT:  # %bb.3:
diff --git a/llvm/test/CodeGen/RISCV/abdu-neg.ll b/llvm/test/CodeGen/RISCV/abdu-neg.ll
index 9e41cde7ae181..a904def2753db 100644
--- a/llvm/test/CodeGen/RISCV/abdu-neg.ll
+++ b/llvm/test/CodeGen/RISCV/abdu-neg.ll
@@ -624,24 +624,24 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind {
 define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a5, 0(a2)
+; RV32I-NEXT:    lw a4, 0(a2)
 ; RV32I-NEXT:    lw a7, 4(a2)
 ; RV32I-NEXT:    lw a3, 8(a2)
 ; RV32I-NEXT:    lw t1, 12(a2)
-; RV32I-NEXT:    lw a4, 8(a1)
-; RV32I-NEXT:    lw a6, 12(a1)
 ; RV32I-NEXT:    lw a2, 0(a1)
 ; RV32I-NEXT:    lw t0, 4(a1)
-; RV32I-NEXT:    sltu a1, a4, a3
+; RV32I-NEXT:    lw a5, 8(a1)
+; RV32I-NEXT:    lw a6, 12(a1)
+; RV32I-NEXT:    sltu a1, a5, a3
 ; RV32I-NEXT:    sub t1, a6, t1
-; RV32I-NEXT:    sltu t2, a2, a5
 ; RV32I-NEXT:    sub a1, t1, a1
+; RV32I-NEXT:    sltu t2, a2, a4
 ; RV32I-NEXT:    mv t1, t2
 ; RV32I-NEXT:    beq t0, a7, .LBB11_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    sltu t1, t0, a7
 ; RV32I-NEXT:  .LBB11_2:
-; RV32I-NEXT:    sub a3, a4, a3
+; RV32I-NEXT:    sub a3, a5, a3
 ; RV32I-NEXT:    sltu t3, a3, t1
 ; RV32I-NEXT:    sub a1, a1, t3
 ; RV32I-NEXT:    sub a3, a3, t1
@@ -650,27 +650,27 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sltu t1, a6, a1
 ; RV32I-NEXT:    j .LBB11_5
 ; RV32I-NEXT:  .LBB11_4:
-; RV32I-NEXT:    sltu t1, a4, a3
+; RV32I-NEXT:    sltu t1, a5, a3
 ; RV32I-NEXT:  .LBB11_5:
 ; RV32I-NEXT:    sub a7, t0, a7
 ; RV32I-NEXT:    sub a7, a7, t2
-; RV32I-NEXT:    sub a5, a2, a5
+; RV32I-NEXT:    sub t2, a2, a4
 ; RV32I-NEXT:    beq a7, t0, .LBB11_7
 ; RV32I-NEXT:  # %bb.6:
 ; RV32I-NEXT:    sltu a2, t0, a7
 ; RV32I-NEXT:    j .LBB11_8
 ; RV32I-NEXT:  .LBB11_7:
-; RV32I-NEXT:    sltu a2, a2, a5
+; RV32I-NEXT:    sltu a2, a2, t2
 ; RV32I-NEXT:  .LBB11_8:
-; RV32I-NEXT:    xor a6, a1, a6
-; RV32I-NEXT:    xor a4, a3, a4
-; RV32I-NEXT:    or a4, a4, a6
+; RV32I-NEXT:    xor a4, a1, a6
+; RV32I-NEXT:    xor a5, a3, a5
+; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    beqz a4, .LBB11_10
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    mv a2, t1
 ; RV32I-NEXT:  .LBB11_10:
 ; RV32I-NEXT:    neg a4, a2
-; RV32I-NEXT:    xor t0, a5, a4
+; RV32I-NEXT:    xor t0, t2, a4
 ; RV32I-NEXT:    xor t3, a7, a4
 ; RV32I-NEXT:    sltu a5, t0, a4
 ; RV32I-NEXT:    add a6, t3, a2
@@ -736,24 +736,24 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ;
 ; RV32ZBB-LABEL: abd_ext_i128:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a5, 0(a2)
+; RV32ZBB-NEXT:    lw a4, 0(a2)
 ; RV32ZBB-NEXT:    lw a7, 4(a2)
 ; RV32ZBB-NEXT:    lw a3, 8(a2)
 ; RV32ZBB-NEXT:    lw t1, 12(a2)
-; RV32ZBB-NEXT:    lw a4, 8(a1)
-; RV32ZBB-NEXT:    lw a6, 12(a1)
 ; RV32ZBB-NEXT:    lw a2, 0(a1)
 ; RV32ZBB-NEXT:    lw t0, 4(a1)
-; RV32ZBB-NEXT:    sltu a1, a4, a3
+; RV32ZBB-NEXT:    lw a5, 8(a1)
+; RV32ZBB-NEXT:    lw a6, 12(a1)
+; RV32ZBB-NEXT:    sltu a1, a5, a3
 ; RV32ZBB-NEXT:    sub t1, a6, t1
-; RV32ZBB-NEXT:    sltu t2, a2, a5
 ; RV32ZBB-NEXT:    sub a1, t1, a1
+; RV32ZBB-NEXT:    sltu t2, a2, a4
 ; RV32ZBB-NEXT:    mv t1, t2
 ; RV32ZBB-NEXT:    beq t0, a7, .LBB11_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    sltu t1, t0, a7
 ; RV32ZBB-NEXT:  .LBB11_2:
-; RV32ZBB-NEXT:    sub a3, a4, a3
+; RV32ZBB-NEXT:    sub a3, a5, a3
 ; RV32ZBB-NEXT:    sltu t3, a3, t1
 ; RV32ZBB-NEXT:    sub a1, a1, t3
 ; RV32ZBB-NEXT:    sub a3, a3, t1
@@ -762,27 +762,27 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    sltu t1, a6, a1
 ; RV32ZBB-NEXT:    j .LBB11_5
 ; RV32ZBB-NEXT:  .LBB11_4:
-; RV32ZBB-NEXT:    sltu t1, a4, a3
+; RV32ZBB-NEXT:    sltu t1, a5, a3
 ; RV32ZBB-NEXT:  .LBB11_5:
 ; RV32ZBB-NEXT:    sub a7, t0, a7
 ; RV32ZBB-NEXT:    sub a7, a7, t2
-; RV32ZBB-NEXT:    sub a5, a2, a5
+; RV32ZBB-NEXT:    sub t2, a2, a4
 ; RV32ZBB-NEXT:    beq a7, t0, .LBB11_7
 ; RV32ZBB-NEXT:  # %bb.6:
 ; RV32ZBB-NEXT:    sltu a2, t0, a7
 ; RV32ZBB-NEXT:    j .LBB11_8
 ; RV32ZBB-NEXT:  .LBB11_7:
-; RV32ZBB-NEXT:    sltu a2, a2, a5
+; RV32ZBB-NEXT:    sltu a2, a2, t2
 ; RV32ZBB-NEXT:  .LBB11_8:
-; RV32ZBB-NEXT:    xor a6, a1, a6
-; RV32ZBB-NEXT:    xor a4, a3, a4
-; RV32ZBB-NEXT:    or a4, a4, a6
+; RV32ZBB-NEXT:    xor a4, a1, a6
+; RV32ZBB-NEXT:    xor a5, a3, a5
+; RV32ZBB-NEXT:    or a4, a5, a4
 ; RV32ZBB-NEXT:    beqz a4, .LBB11_10
 ; RV32ZBB-NEXT:  # %bb.9:
 ; RV32ZBB-NEXT:    mv a2, t1
 ; RV32ZBB-NEXT:  .LBB11_10:
 ; RV32ZBB-NEXT:    neg a4, a2
-; RV32ZBB-NEXT:    xor t0, a5, a4
+; RV32ZBB-NEXT:    xor t0, t2, a4
 ; RV32ZBB-NEXT:    xor t3, a7, a4
 ; RV32ZBB-NEXT:    sltu a5, t0, a4
 ; RV32ZBB-NEXT:    add a6, t3, a2
@@ -857,24 +857,24 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i128_undef:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a5, 0(a2)
+; RV32I-NEXT:    lw a4, 0(a2)
 ; RV32I-NEXT:    lw a7, 4(a2)
 ; RV32I-NEXT:    lw a3, 8(a2)
 ; RV32I-NEXT:    lw t1, 12(a2)
-; RV32I-NEXT:    lw a4, 8(a1)
-; RV32I-NEXT:    lw a6, 12(a1)
 ; RV32I-NEXT:    lw a2, 0(a1)
 ; RV32I-NEXT:    lw t0, 4(a1)
-; RV32I-NEXT:    sltu a1, a4, a3
+; RV32I-NEXT:    lw a5, 8(a1)
+; RV32I-NEXT:    lw a6, 12(a1)
+; RV32I-NEXT:    sltu a1, a5, a3
 ; RV32I-NEXT:    sub t1, a6, t1
-; RV32I-NEXT:    sltu t2, a2, a5
 ; RV32I-NEXT:    sub a1, t1, a1
+; RV32I-NEXT:    sltu t2, a2, a4
 ; RV32I-NEXT:    mv t1, t2
 ; RV32I-NEXT:    beq t0, a7, .LBB12_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    sltu t1, t0, a7
 ; RV32I-NEXT:  .LBB12_2:
-; RV32I-NEXT:    sub a3, a4, a3
+; RV32I-NEXT:    sub a3, a5, a3
 ; RV32I-NEXT:    sltu t3, a3, t1
 ; RV32I-NEXT:    sub a1, a1, t3
 ; RV32I-NEXT:    sub a3, a3, t1
@@ -883,27 +883,27 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sltu t1, a6, a1
 ; RV32I-NEXT:    j .LBB12_5
 ; RV32I-NEXT:  .LBB12_4:
-; RV32I-NEXT:    sltu t1, a4, a3
+; RV32I-NEXT:    sltu t1, a5, a3
 ; RV32I-NEXT:  .LBB12_5:
 ; RV32I-NEXT:    sub a7, t0, a7
 ; RV32I-NEXT:    sub a7, a7, t2
-; RV32I-NEXT:    sub a5, a2, a5
+; RV32I-NEXT:    sub t2, a2, a4
 ; RV32I-NEXT:    beq a7, t0, .LBB12_7
 ; RV32I-NEXT:  # %bb.6:
 ; RV32I-NEXT:    sltu a2, t0, a7
 ; RV32I-NEXT:    j .LBB12_8
 ; RV32I-NEXT:  .LBB12_7:
-; RV32I-NEXT:    sltu a2, a2, a5
+; RV32I-NEXT:    sltu a2, a2, t2
 ; RV32I-NEXT:  .LBB12_8:
-; RV32I-NEXT:    xor a6, a1, a6
-; RV32I-NEXT:    xor a4, a3, a4
-; RV32I-NEXT:    or a4, a4, a6
+; RV32I-NEXT:    xor a4, a1, a6
+; RV32I-NEXT:    xor a5, a3, a5
+; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    beqz a4, .LBB12_10
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    mv a2, t1
 ; RV32I-NEXT:  .LBB12_10:
 ; RV32I-NEXT:    neg a4, a2
-; RV32I-NEXT:    xor t0, a5, a4
+; RV32I-NEXT:    xor t0, t2, a4
 ; RV32I-NEXT:    xor t3, a7, a4
 ; RV32I-NEXT:    sltu a5, t0, a4
 ; RV32I-NEXT:    add a6, t3, a2
@@ -969,24 +969,24 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ;
 ; RV32ZBB-LABEL: abd_ext_i128_undef:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a5, 0(a2)
+; RV32ZBB-NEXT:    lw a4, 0(a2)
 ; RV32ZBB-NEXT:    lw a7, 4(a2)
 ; RV32ZBB-NEXT:    lw a3, 8(a2)
 ; RV32ZBB-NEXT:    lw t1, 12(a2)
-; RV32ZBB-NEXT:    lw a4, 8(a1)
-; RV32ZBB-NEXT:    lw a6, 12(a1)
 ; RV32ZBB-NEXT:    lw a2, 0(a1)
 ; RV32ZBB-NEXT:    lw t0, 4(a1)
-; RV32ZBB-NEXT:    sltu a1, a4, a3
+; RV32ZBB-NEXT:    lw a5, 8(a1)
+; RV32ZBB-NEXT:    lw a6, 12(a1)
+; RV32ZBB-NEXT:    sltu a1, a5, a3
 ; RV32ZBB-NEXT:    sub t1, a6, t1
-; RV32ZBB-NEXT:    sltu t2, a2, a5
 ; RV32ZBB-NEXT:    sub a1, t1, a1
+; RV32ZBB-NEXT:    sltu t2, a2, a4
 ; RV32ZBB-NEXT:    mv t1, t2
 ; RV32ZBB-NEXT:    beq t0, a7, .LBB12_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    sltu t1, t0, a7
 ; RV32ZBB-NEXT:  .LBB12_2:
-; RV32ZBB-NEXT:    sub a3, a4, a3
+; RV32ZBB-NEXT:    sub a3, a5, a3
 ; RV32ZBB-NEXT:    sltu t3, a3, t1
 ; RV32ZBB-NEXT:    sub a1, a1, t3
 ; RV32ZBB-NEXT:    sub a3, a3, t1
@@ -995,27 +995,27 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    sltu t1, a6, a1
 ; RV32ZBB-NEXT:    j .LBB12_5
 ; RV32ZBB-NEXT:  .LBB12_4:
-; RV32ZBB-NEXT:    sltu t1, a4, a3
+; RV32ZBB-NEXT:    sltu t1, a5, a3
 ; RV32ZBB-NEXT:  .LBB12_5:
 ; RV32ZBB-NEXT:    sub a7, t0, a7
 ; RV32ZBB-NEXT:    sub a7, a7, t2
-; RV32ZBB-NEXT:    sub a5, a2, a5
+; RV32ZBB-NEXT:    sub t2, a2, a4
 ; RV32ZBB-NEXT:    beq a7, t0, .LBB12_7
 ; RV32ZBB-NEXT:  # %bb.6:
 ; RV32ZBB-NEXT:    sltu a2, t0, a7
 ; RV32ZBB-NEXT:    j .LBB12_8
 ; RV32ZBB-NEXT:  .LBB12_7:
-; RV32ZBB-NEXT:    sltu a2, a2, a5
+; RV32ZBB-NEXT:    sltu a2, a2, t2
 ; RV32ZBB-NEXT:  .LBB12_8:
-; RV32ZBB-NEXT:    xor a6, a1, a6
-; RV32ZBB-NEXT:    xor a4, a3, a4
-; RV32ZBB-NEXT:    or a4, a4, a6
+; RV32ZBB-NEXT:    xor a4, a1, a6
+; RV32ZBB-NEXT:    xor a5, a3, a5
+; RV32ZBB-NEXT:    or a4, a5, a4
 ; RV32ZBB-NEXT:    beqz a4, .LBB12_10
 ; RV32ZBB-NEXT:  # %bb.9:
 ; RV32ZBB-NEXT:    mv a2, t1
 ; RV32ZBB-NEXT:  .LBB12_10:
 ; RV32ZBB-NEXT:    neg a4, a2
-; RV32ZBB-NEXT:    xor t0, a5, a4
+; RV32ZBB-NEXT:    xor t0, t2, a4
 ; RV32ZBB-NEXT:    xor t3, a7, a4
 ; RV32ZBB-NEXT:    sltu a5, t0, a4
 ; RV32ZBB-NEXT:    add a6, t3, a2
@@ -1335,30 +1335,30 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind {
 define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: abd_minmax_i128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a6, 4(a2)
-; RV32I-NEXT:    lw a7, 8(a2)
+; RV32I-NEXT:    lw a5, 4(a2)
+; RV32I-NEXT:    lw a6, 8(a2)
 ; RV32I-NEXT:    lw t0, 12(a2)
-; RV32I-NEXT:    lw a5, 12(a1)
 ; RV32I-NEXT:    lw a3, 4(a1)
 ; RV32I-NEXT:    lw a4, 8(a1)
-; RV32I-NEXT:    beq a5, t0, .LBB17_2
+; RV32I-NEXT:    lw a7, 12(a1)
+; RV32I-NEXT:    beq a7, t0, .LBB17_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t1, a5, t0
+; RV32I-NEXT:    sltu t1, a7, t0
 ; RV32I-NEXT:    j .LBB17_3
 ; RV32I-NEXT:  .LBB17_2:
-; RV32I-NEXT:    sltu t1, a4, a7
+; RV32I-NEXT:    sltu t1, a4, a6
 ; RV32I-NEXT:  .LBB17_3:
 ; RV32I-NEXT:    lw t2, 0(a2)
 ; RV32I-NEXT:    lw a1, 0(a1)
-; RV32I-NEXT:    beq a3, a6, .LBB17_5
+; RV32I-NEXT:    beq a3, a5, .LBB17_5
 ; RV32I-NEXT:  # %bb.4:
-; RV32I-NEXT:    sltu t6, a3, a6
+; RV32I-NEXT:    sltu t6, a3, a5
 ; RV32I-NEXT:    j .LBB17_6
 ; RV32I-NEXT:  .LBB17_5:
 ; RV32I-NEXT:    sltu t6, a1, t2
 ; RV32I-NEXT:  .LBB17_6:
-; RV32I-NEXT:    xor a2, a5, t0
-; RV32I-NEXT:    xor t3, a4, a7
+; RV32I-NEXT:    xor a2, a7, t0
+; RV32I-NEXT:    xor t3, a4, a6
 ; RV32I-NEXT:    or t5, t3, a2
 ; RV32I-NEXT:    beqz t5, .LBB17_8
 ; RV32I-NEXT:  # %bb.7:
@@ -1366,27 +1366,27 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB17_8:
 ; RV32I-NEXT:    mv a2, a1
 ; RV32I-NEXT:    mv t1, a3
-; RV32I-NEXT:    mv t4, a5
+; RV32I-NEXT:    mv t4, a7
 ; RV32I-NEXT:    mv t3, a4
 ; RV32I-NEXT:    bnez t6, .LBB17_10
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    mv a2, t2
-; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    mv t1, a5
 ; RV32I-NEXT:    mv t4, t0
-; RV32I-NEXT:    mv t3, a7
+; RV32I-NEXT:    mv t3, a6
 ; RV32I-NEXT:  .LBB17_10:
-; RV32I-NEXT:    beq a5, t0, .LBB17_12
+; RV32I-NEXT:    beq a7, t0, .LBB17_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu t6, t0, a5
+; RV32I-NEXT:    sltu t6, t0, a7
 ; RV32I-NEXT:    j .LBB17_13
 ; RV32I-NEXT:  .LBB17_12:
-; RV32I-NEXT:    sltu t6, a7, a4
+; RV32I-NEXT:    sltu t6, a6, a4
 ; RV32I-NEXT:  .LBB17_13:
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    beq a3, a6, .LBB17_15
+; RV32I-NEXT:    beq a3, a5, .LBB17_15
 ; RV32I-NEXT:  # %bb.14:
-; RV32I-NEXT:    sltu s0, a6, a3
+; RV32I-NEXT:    sltu s0, a5, a3
 ; RV32I-NEXT:    bnez t5, .LBB17_16
 ; RV32I-NEXT:    j .LBB17_17
 ; RV32I-NEXT:  .LBB17_15:
@@ -1398,14 +1398,14 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    bnez s0, .LBB17_19
 ; RV32I-NEXT:  # %bb.18:
 ; RV32I-NEXT:    mv a1, t2
-; RV32I-NEXT:    mv a3, a6
-; RV32I-NEXT:    mv a5, t0
-; RV32I-NEXT:    mv a4, a7
+; RV32I-NEXT:    mv a3, a5
+; RV32I-NEXT:    mv a7, t0
+; RV32I-NEXT:    mv a4, a6
 ; RV32I-NEXT:  .LBB17_19:
-; RV32I-NEXT:    sltu a7, t3, a4
-; RV32I-NEXT:    sub a5, t4, a5
+; RV32I-NEXT:    sltu a5, t3, a4
+; RV32I-NEXT:    sub a6, t4, a7
+; RV32I-NEXT:    sub a5, a6, a5
 ; RV32I-NEXT:    sltu a6, a2, a1
-; RV32I-NEXT:    sub a5, a5, a7
 ; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beq t1, a3, .LBB17_21
 ; RV32I-NEXT:  # %bb.20:
@@ -1462,30 +1462,30 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ;
 ; RV32ZBB-LABEL: abd_minmax_i128:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a6, 4(a2)
-; RV32ZBB-NEXT:    lw a7, 8(a2)
+; RV32ZBB-NEXT:    lw a5, 4(a2)
+; RV32ZBB-NEXT:    lw a6, 8(a2)
 ; RV32ZBB-NEXT:    lw t0, 12(a2)
-; RV32ZBB-NEXT:    lw a5, 12(a1)
 ; RV32ZBB-NEXT:    lw a3, 4(a1)
 ; RV32ZBB-NEXT:    lw a4, 8(a1)
-; RV32ZBB-NEXT:    beq a5, t0, .LBB17_2
+; RV32ZBB-NEXT:    lw a7, 12(a1)
+; RV32ZBB-NEXT:    beq a7, t0, .LBB17_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t1, a5, t0
+; RV32ZBB-NEXT:    sltu t1, a7, t0
 ; RV32ZBB-NEXT:    j .LBB17_3
 ; RV32ZBB-NEXT:  .LBB17_2:
-; RV32ZBB-NEXT:    sltu t1, a4, a7
+; RV32ZBB-NEXT:    sltu t1, a4, a6
 ; RV32ZBB-NEXT:  .LBB17_3:
 ; RV32ZBB-NEXT:    lw t2, 0(a2)
 ; RV32ZBB-NEXT:    lw a1, 0(a1)
-; RV32ZBB-NEXT:    beq a3, a6, .LBB17_5
+; RV32ZBB-NEXT:    beq a3, a5, .LBB17_5
 ; RV32ZBB-NEXT:  # %bb.4:
-; RV32ZBB-NEXT:    sltu t6, a3, a6
+; RV32ZBB-NEXT:    sltu t6, a3, a5
 ; RV32ZBB-NEXT:    j .LBB17_6
 ; RV32ZBB-NEXT:  .LBB17_5:
 ; RV32ZBB-NEXT:    sltu t6, a1, t2
 ; RV32ZBB-NEXT:  .LBB17_6:
-; RV32ZBB-NEXT:    xor a2, a5, t0
-; RV32ZBB-NEXT:    xor t3, a4, a7
+; RV32ZBB-NEXT:    xor a2, a7, t0
+; RV32ZBB-NEXT:    xor t3, a4, a6
 ; RV32ZBB-NEXT:    or t5, t3, a2
 ; RV32ZBB-NEXT:    beqz t5, .LBB17_8
 ; RV32ZBB-NEXT:  # %bb.7:
@@ -1493,27 +1493,27 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB17_8:
 ; RV32ZBB-NEXT:    mv a2, a1
 ; RV32ZBB-NEXT:    mv t1, a3
-; RV32ZBB-NEXT:    mv t4, a5
+; RV32ZBB-NEXT:    mv t4, a7
 ; RV32ZBB-NEXT:    mv t3, a4
 ; RV32ZBB-NEXT:    bnez t6, .LBB17_10
 ; RV32ZBB-NEXT:  # %bb.9:
 ; RV32ZBB-NEXT:    mv a2, t2
-; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    mv t1, a5
 ; RV32ZBB-NEXT:    mv t4, t0
-; RV32ZBB-NEXT:    mv t3, a7
+; RV32ZBB-NEXT:    mv t3, a6
 ; RV32ZBB-NEXT:  .LBB17_10:
-; RV32ZBB-NEXT:    beq a5, t0, .LBB17_12
+; RV32ZBB-NEXT:    beq a7, t0, .LBB17_12
 ; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu t6, t0, a5
+; RV32ZBB-NEXT:    sltu t6, t0, a7
 ; RV32ZBB-NEXT:    j .LBB17_13
 ; RV32ZBB-NEXT:  .LBB17_12:
-; RV32ZBB-NEXT:    sltu t6, a7, a4
+; RV32ZBB-NEXT:    sltu t6, a6, a4
 ; RV32ZBB-NEXT:  .LBB17_13:
 ; RV32ZBB-NEXT:    addi sp, sp, -16
 ; RV32ZBB-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZBB-NEXT:    beq a3, a6, .LBB17_15
+; RV32ZBB-NEXT:    beq a3, a5, .LBB17_15
 ; RV32ZBB-NEXT:  # %bb.14:
-; RV32ZBB-NEXT:    sltu s0, a6, a3
+; RV32ZBB-NEXT:    sltu s0, a5, a3
 ; RV32ZBB-NEXT:    bnez t5, .LBB17_16
 ; RV32ZBB-NEXT:    j .LBB17_17
 ; RV32ZBB-NEXT:  .LBB17_15:
@@ -1525,14 +1525,14 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    bnez s0, .LBB17_19
 ; RV32ZBB-NEXT:  # %bb.18:
 ; RV32ZBB-NEXT:    mv a1, t2
-; RV32ZBB-NEXT:    mv a3, a6
-; RV32ZBB-NEXT:    mv a5, t0
-; RV32ZBB-NEXT:    mv a4, a7
+; RV32ZBB-NEXT:    mv a3, a5
+; RV32ZBB-NEXT:    mv a7, t0
+; RV32ZBB-NEXT:    mv a4, a6
 ; RV32ZBB-NEXT:  .LBB17_19:
-; RV32ZBB-NEXT:    sltu a7, t3, a4
-; RV32ZBB-NEXT:    sub a5, t4, a5
+; RV32ZBB-NEXT:    sltu a5, t3, a4
+; RV32ZBB-NEXT:    sub a6, t4, a7
+; RV32ZBB-NEXT:    sub a5, a6, a5
 ; RV32ZBB-NEXT:    sltu a6, a2, a1
-; RV32ZBB-NEXT:    sub a5, a5, a7
 ; RV32ZBB-NEXT:    mv a7, a6
 ; RV32ZBB-NEXT:    beq t1, a3, .LBB17_21
 ; RV32ZBB-NEXT:  # %bb.20:
@@ -1799,26 +1799,26 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a3, 0(a2)
 ; RV32I-NEXT:    lw a4, 4(a2)
-; RV32I-NEXT:    lw a5, 8(a2)
-; RV32I-NEXT:    lw a7, 12(a2)
-; RV32I-NEXT:    lw a6, 8(a1)
-; RV32I-NEXT:    lw t0, 12(a1)
+; RV32I-NEXT:    lw a6, 8(a2)
+; RV32I-NEXT:    lw t0, 12(a2)
 ; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
-; RV32I-NEXT:    sltu t1, a6, a5
-; RV32I-NEXT:    mv t4, t1
-; RV32I-NEXT:    beq t0, a7, .LBB22_2
+; RV32I-NEXT:    lw a5, 4(a1)
+; RV32I-NEXT:    lw a7, 8(a1)
+; RV32I-NEXT:    lw t1, 12(a1)
+; RV32I-NEXT:    sltu a1, a7, a6
+; RV32I-NEXT:    mv t4, a1
+; RV32I-NEXT:    beq t1, t0, .LBB22_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t4, t0, a7
+; RV32I-NEXT:    sltu t4, t1, t0
 ; RV32I-NEXT:  .LBB22_2:
 ; RV32I-NEXT:    sltu t2, a2, a3
 ; RV32I-NEXT:    mv t3, t2
-; RV32I-NEXT:    beq a1, a4, .LBB22_4
+; RV32I-NEXT:    beq a5, a4, .LBB22_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu t3, a1, a4
+; RV32I-NEXT:    sltu t3, a5, a4
 ; RV32I-NEXT:  .LBB22_4:
-; RV32I-NEXT:    xor t5, t0, a7
-; RV32I-NEXT:    xor t6, a6, a5
+; RV32I-NEXT:    xor t5, t1, t0
+; RV32I-NEXT:    xor t6, a7, a6
 ; RV32I-NEXT:    or t5, t6, t5
 ; RV32I-NEXT:    mv t6, t3
 ; RV32I-NEXT:    beqz t5, .LBB22_6
@@ -1827,32 +1827,32 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB22_6:
 ; RV32I-NEXT:    sltu t4, a3, a2
 ; RV32I-NEXT:    mv t5, t4
-; RV32I-NEXT:    beq a1, a4, .LBB22_8
+; RV32I-NEXT:    beq a5, a4, .LBB22_8
 ; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    sltu t5, a4, a1
+; RV32I-NEXT:    sltu t5, a4, a5
 ; RV32I-NEXT:  .LBB22_8:
 ; RV32I-NEXT:    bnez t6, .LBB22_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    sltu t1, a5, a6
-; RV32I-NEXT:    sub a7, a7, t0
-; RV32I-NEXT:    sub a5, a5, a6
-; RV32I-NEXT:    sub a4, a4, a1
-; RV32I-NEXT:    sub a6, a7, t1
-; RV32I-NEXT:    sltu a7, a5, t5
-; RV32I-NEXT:    sub a1, a5, t5
+; RV32I-NEXT:    sltu a1, a6, a7
+; RV32I-NEXT:    sub t0, t0, t1
+; RV32I-NEXT:    sub a6, a6, a7
+; RV32I-NEXT:    sub a4, a4, a5
+; RV32I-NEXT:    sub a7, t0, a1
+; RV32I-NEXT:    sltu t0, a6, t5
+; RV32I-NEXT:    sub a1, a6, t5
 ; RV32I-NEXT:    sub a5, a4, t4
-; RV32I-NEXT:    sub a4, a6, a7
+; RV32I-NEXT:    sub a4, a7, t0
 ; RV32I-NEXT:    sub a2, a3, a2
 ; RV32I-NEXT:    j .LBB22_11
 ; RV32I-NEXT:  .LBB22_10:
-; RV32I-NEXT:    sub a7, t0, a7
-; RV32I-NEXT:    sub a5, a6, a5
-; RV32I-NEXT:    sub a4, a1, a4
-; RV32I-NEXT:    sub a6, a7, t1
-; RV32I-NEXT:    sltu a7, a5, t3
-; RV32I-NEXT:    sub a1, a5, t3
-; RV32I-NEXT:    sub a5, a4, t2
-; RV32I-NEXT:    sub a4, a6, a7
+; RV32I-NEXT:    sub t0, t1, t0
+; RV32I-NEXT:    sub a6, a7, a6
+; RV32I-NEXT:    sub a5, a5, a4
+; RV32I-NEXT:    sub a4, t0, a1
+; RV32I-NEXT:    sltu a7, a6, t3
+; RV32I-NEXT:    sub a1, a6, t3
+; RV32I-NEXT:    sub a5, a5, t2
+; RV32I-NEXT:    sub a4, a4, a7
 ; RV32I-NEXT:    sub a2, a2, a3
 ; RV32I-NEXT:  .LBB22_11:
 ; RV32I-NEXT:    sw a2, 0(a0)
@@ -1886,26 +1886,26 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a3, 0(a2)
 ; RV32ZBB-NEXT:    lw a4, 4(a2)
-; RV32ZBB-NEXT:    lw a5, 8(a2)
-; RV32ZBB-NEXT:    lw a7, 12(a2)
-; RV32ZBB-NEXT:    lw a6, 8(a1)
-; RV32ZBB-NEXT:    lw t0, 12(a1)
+; RV32ZBB-NEXT:    lw a6, 8(a2)
+; RV32ZBB-NEXT:    lw t0, 12(a2)
 ; RV32ZBB-NEXT:    lw a2, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 4(a1)
-; RV32ZBB-NEXT:    sltu t1, a6, a5
-; RV32ZBB-NEXT:    mv t4, t1
-; RV32ZBB-NEXT:    beq t0, a7, .LBB22_2
+; RV32ZBB-NEXT:    lw a5, 4(a1)
+; RV32ZBB-NEXT:    lw a7, 8(a1)
+; RV32ZBB-NEXT:    lw t1, 12(a1)
+; RV32ZBB-NEXT:    sltu a1, a7, a6
+; RV32ZBB-NEXT:    mv t4, a1
+; RV32ZBB-NEXT:    beq t1, t0, .LBB22_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t4, t0, a7
+; RV32ZBB-NEXT:    sltu t4, t1, t0
 ; RV32ZBB-NEXT:  .LBB22_2:
 ; RV32ZBB-NEXT:    sltu t2, a2, a3
 ; RV32ZBB-NEXT:    mv t3, t2
-; RV32ZBB-NEXT:    beq a1, a4, .LBB22_4
+; RV32ZBB-NEXT:    beq a5, a4, .LBB22_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu t3, a1, a4
+; RV32ZBB-NEXT:    sltu t3, a5, a4
 ; RV32ZBB-NEXT:  .LBB22_4:
-; RV32ZBB-NEXT:    xor t5, t0, a7
-; RV32ZBB-NEXT:    xor t6, a6, a5
+; RV32ZBB-NEXT:    xor t5, t1, t0
+; RV32ZBB-NEXT:    xor t6, a7, a6
 ; RV32ZBB-NEXT:    or t5, t6, t5
 ; RV32ZBB-NEXT:    mv t6, t3
 ; RV32ZBB-NEXT:    beqz t5, .LBB22_6
@@ -1914,32 +1914,32 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB22_6:
 ; RV32ZBB-NEXT:    sltu t4, a3, a2
 ; RV32ZBB-NEXT:    mv t5, t4
-; RV32ZBB-NEXT:    beq a1, a4, .LBB22_8
+; RV32ZBB-NEXT:    beq a5, a4, .LBB22_8
 ; RV32ZBB-NEXT:  # %bb.7:
-; RV32ZBB-NEXT:    sltu t5, a4, a1
+; RV32ZBB-NEXT:    sltu t5, a4, a5
 ; RV32ZBB-NEXT:  .LBB22_8:
 ; RV32ZBB-NEXT:    bnez t6, .LBB22_10
 ; RV32ZBB-NEXT:  # %bb.9:
-; RV32ZBB-NEXT:    sltu t1, a5, a6
-; RV32ZBB-NEXT:    sub a7, a7, t0
-; RV32ZBB-NEXT:    sub a5, a5, a6
-; RV32ZBB-NEXT:    sub a4, a4, a1
-; RV32ZBB-NEXT:    sub a6, a7, t1
-; RV32ZBB-NEXT:    sltu a7, a5, t5
-; RV32ZBB-NEXT:    sub a1, a5, t5
+; RV32ZBB-NEXT:    sltu a1, a6, a7
+; RV32ZBB-NEXT:    sub t0, t0, t1
+; RV32ZBB-NEXT:    sub a6, a6, a7
+; RV32ZBB-NEXT:    sub a4, a4, a5
+; RV32ZBB-NEXT:    sub a7, t0, a1
+; RV32ZBB-NEXT:    sltu t0, a6, t5
+; RV32ZBB-NEXT:    sub a1, a6, t5
 ; RV32ZBB-NEXT:    sub a5, a4, t4
-; RV32ZBB-NEXT:    sub a4, a6, a7
+; RV32ZBB-NEXT:    sub a4, a7, t0
 ; RV32ZBB-NEXT:    sub a2, a3, a2
 ; RV32ZBB-NEXT:    j .LBB22_11
 ; RV32ZBB-NEXT:  .LBB22_10:
-; RV32ZBB-NEXT:    sub a7, t0, a7
-; RV32ZBB-NEXT:    sub a5, a6, a5
-; RV32ZBB-NEXT:    sub a4, a1, a4
-; RV32ZBB-NEXT:    sub a6, a7, t1
-; RV32ZBB-NEXT:    sltu a7, a5, t3
-; RV32ZBB-NEXT:    sub a1, a5, t3
-; RV32ZBB-NEXT:    sub a5, a4, t2
-; RV32ZBB-NEXT:    sub a4, a6, a7
+; RV32ZBB-NEXT:    sub t0, t1, t0
+; RV32ZBB-NEXT:    sub a6, a7, a6
+; RV32ZBB-NEXT:    sub a5, a5, a4
+; RV32ZBB-NEXT:    sub a4, t0, a1
+; RV32ZBB-NEXT:    sltu a7, a6, t3
+; RV32ZBB-NEXT:    sub a1, a6, t3
+; RV32ZBB-NEXT:    sub a5, a5, t2
+; RV32ZBB-NEXT:    sub a4, a4, a7
 ; RV32ZBB-NEXT:    sub a2, a2, a3
 ; RV32ZBB-NEXT:  .LBB22_11:
 ; RV32ZBB-NEXT:    sw a2, 0(a0)
diff --git a/llvm/test/CodeGen/RISCV/abdu.ll b/llvm/test/CodeGen/RISCV/abdu.ll
index 7c8638cb461e2..899c12a2e128d 100644
--- a/llvm/test/CodeGen/RISCV/abdu.ll
+++ b/llvm/test/CodeGen/RISCV/abdu.ll
@@ -541,75 +541,75 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i128:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a3, 0(a2)
-; RV32I-NEXT:    lw a5, 4(a2)
-; RV32I-NEXT:    lw a6, 8(a2)
+; RV32I-NEXT:    lw a6, 4(a2)
+; RV32I-NEXT:    lw a5, 8(a2)
 ; RV32I-NEXT:    lw a7, 12(a2)
+; RV32I-NEXT:    lw t0, 0(a1)
+; RV32I-NEXT:    lw t1, 4(a1)
 ; RV32I-NEXT:    lw a2, 8(a1)
 ; RV32I-NEXT:    lw a4, 12(a1)
-; RV32I-NEXT:    lw t0, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
-; RV32I-NEXT:    sltu t1, a2, a6
+; RV32I-NEXT:    sltu a1, a2, a5
 ; RV32I-NEXT:    sub a7, a4, a7
-; RV32I-NEXT:    sltu t2, t0, a3
-; RV32I-NEXT:    sub a7, a7, t1
-; RV32I-NEXT:    mv t1, t2
-; RV32I-NEXT:    beq a1, a5, .LBB11_2
+; RV32I-NEXT:    sub a7, a7, a1
+; RV32I-NEXT:    sltu a1, t0, a3
+; RV32I-NEXT:    mv t2, a1
+; RV32I-NEXT:    beq t1, a6, .LBB11_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t1, a1, a5
+; RV32I-NEXT:    sltu t2, t1, a6
 ; RV32I-NEXT:  .LBB11_2:
-; RV32I-NEXT:    sub t3, a2, a6
-; RV32I-NEXT:    sltu a6, t3, t1
-; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a7, t3, t1
-; RV32I-NEXT:    beq a6, a4, .LBB11_4
+; RV32I-NEXT:    sub t3, a2, a5
+; RV32I-NEXT:    sltu a5, t3, t2
+; RV32I-NEXT:    sub a5, a7, a5
+; RV32I-NEXT:    sub a7, t3, t2
+; RV32I-NEXT:    beq a5, a4, .LBB11_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu t1, a4, a6
+; RV32I-NEXT:    sltu t2, a4, a5
 ; RV32I-NEXT:    j .LBB11_5
 ; RV32I-NEXT:  .LBB11_4:
-; RV32I-NEXT:    sltu t1, a2, a7
+; RV32I-NEXT:    sltu t2, a2, a7
 ; RV32I-NEXT:  .LBB11_5:
-; RV32I-NEXT:    sub a5, a1, a5
-; RV32I-NEXT:    sub a5, a5, t2
+; RV32I-NEXT:    sub a6, t1, a6
+; RV32I-NEXT:    sub a6, a6, a1
 ; RV32I-NEXT:    sub a3, t0, a3
-; RV32I-NEXT:    beq a5, a1, .LBB11_7
+; RV32I-NEXT:    beq a6, t1, .LBB11_7
 ; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    sltu a1, a1, a5
+; RV32I-NEXT:    sltu a1, t1, a6
 ; RV32I-NEXT:    j .LBB11_8
 ; RV32I-NEXT:  .LBB11_7:
 ; RV32I-NEXT:    sltu a1, t0, a3
 ; RV32I-NEXT:  .LBB11_8:
-; RV32I-NEXT:    xor a4, a6, a4
+; RV32I-NEXT:    xor a4, a5, a4
 ; RV32I-NEXT:    xor a2, a7, a2
 ; RV32I-NEXT:    or a2, a2, a4
 ; RV32I-NEXT:    beqz a2, .LBB11_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    mv a1, t1
+; RV32I-NEXT:    mv a1, t2
 ; RV32I-NEXT:  .LBB11_10:
 ; RV32I-NEXT:    neg t0, a1
-; RV32I-NEXT:    xor a2, a7, t0
-; RV32I-NEXT:    xor a6, a6, t0
-; RV32I-NEXT:    xor a4, a3, t0
-; RV32I-NEXT:    sltu a3, a2, t0
-; RV32I-NEXT:    add a7, a6, a1
-; RV32I-NEXT:    sltu a6, a4, t0
-; RV32I-NEXT:    sub a3, a7, a3
-; RV32I-NEXT:    xor t1, a5, t0
-; RV32I-NEXT:    mv a7, a6
-; RV32I-NEXT:    beqz a5, .LBB11_12
-; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu a7, t1, t0
-; RV32I-NEXT:  .LBB11_12:
+; RV32I-NEXT:    xor a4, a7, t0
+; RV32I-NEXT:    xor a2, a5, t0
+; RV32I-NEXT:    xor a5, a6, t0
+; RV32I-NEXT:    xor a3, a3, t0
+; RV32I-NEXT:    sltu a7, a4, t0
 ; RV32I-NEXT:    add a2, a2, a1
-; RV32I-NEXT:    add t1, t1, a1
-; RV32I-NEXT:    add a1, a4, a1
-; RV32I-NEXT:    sltu a4, a2, a7
 ; RV32I-NEXT:    sub a2, a2, a7
-; RV32I-NEXT:    sub a5, t1, a6
-; RV32I-NEXT:    sub a3, a3, a4
+; RV32I-NEXT:    sltu a7, a3, t0
+; RV32I-NEXT:    mv t1, a7
+; RV32I-NEXT:    beqz a6, .LBB11_12
+; RV32I-NEXT:  # %bb.11:
+; RV32I-NEXT:    sltu t1, a5, t0
+; RV32I-NEXT:  .LBB11_12:
+; RV32I-NEXT:    add a4, a4, a1
+; RV32I-NEXT:    add a5, a5, a1
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    sltu a3, a4, t1
+; RV32I-NEXT:    sub a4, a4, t1
+; RV32I-NEXT:    sub a5, a5, a7
+; RV32I-NEXT:    sub a2, a2, a3
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a4, 8(a0)
+; RV32I-NEXT:    sw a2, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_ext_i128:
@@ -637,75 +637,75 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-LABEL: abd_ext_i128:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a3, 0(a2)
-; RV32ZBB-NEXT:    lw a5, 4(a2)
-; RV32ZBB-NEXT:    lw a6, 8(a2)
+; RV32ZBB-NEXT:    lw a6, 4(a2)
+; RV32ZBB-NEXT:    lw a5, 8(a2)
 ; RV32ZBB-NEXT:    lw a7, 12(a2)
+; RV32ZBB-NEXT:    lw t0, 0(a1)
+; RV32ZBB-NEXT:    lw t1, 4(a1)
 ; RV32ZBB-NEXT:    lw a2, 8(a1)
 ; RV32ZBB-NEXT:    lw a4, 12(a1)
-; RV32ZBB-NEXT:    lw t0, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 4(a1)
-; RV32ZBB-NEXT:    sltu t1, a2, a6
+; RV32ZBB-NEXT:    sltu a1, a2, a5
 ; RV32ZBB-NEXT:    sub a7, a4, a7
-; RV32ZBB-NEXT:    sltu t2, t0, a3
-; RV32ZBB-NEXT:    sub a7, a7, t1
-; RV32ZBB-NEXT:    mv t1, t2
-; RV32ZBB-NEXT:    beq a1, a5, .LBB11_2
+; RV32ZBB-NEXT:    sub a7, a7, a1
+; RV32ZBB-NEXT:    sltu a1, t0, a3
+; RV32ZBB-NEXT:    mv t2, a1
+; RV32ZBB-NEXT:    beq t1, a6, .LBB11_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t1, a1, a5
+; RV32ZBB-NEXT:    sltu t2, t1, a6
 ; RV32ZBB-NEXT:  .LBB11_2:
-; RV32ZBB-NEXT:    sub t3, a2, a6
-; RV32ZBB-NEXT:    sltu a6, t3, t1
-; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a7, t3, t1
-; RV32ZBB-NEXT:    beq a6, a4, .LBB11_4
+; RV32ZBB-NEXT:    sub t3, a2, a5
+; RV32ZBB-NEXT:    sltu a5, t3, t2
+; RV32ZBB-NEXT:    sub a5, a7, a5
+; RV32ZBB-NEXT:    sub a7, t3, t2
+; RV32ZBB-NEXT:    beq a5, a4, .LBB11_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu t1, a4, a6
+; RV32ZBB-NEXT:    sltu t2, a4, a5
 ; RV32ZBB-NEXT:    j .LBB11_5
 ; RV32ZBB-NEXT:  .LBB11_4:
-; RV32ZBB-NEXT:    sltu t1, a2, a7
+; RV32ZBB-NEXT:    sltu t2, a2, a7
 ; RV32ZBB-NEXT:  .LBB11_5:
-; RV32ZBB-NEXT:    sub a5, a1, a5
-; RV32ZBB-NEXT:    sub a5, a5, t2
+; RV32ZBB-NEXT:    sub a6, t1, a6
+; RV32ZBB-NEXT:    sub a6, a6, a1
 ; RV32ZBB-NEXT:    sub a3, t0, a3
-; RV32ZBB-NEXT:    beq a5, a1, .LBB11_7
+; RV32ZBB-NEXT:    beq a6, t1, .LBB11_7
 ; RV32ZBB-NEXT:  # %bb.6:
-; RV32ZBB-NEXT:    sltu a1, a1, a5
+; RV32ZBB-NEXT:    sltu a1, t1, a6
 ; RV32ZBB-NEXT:    j .LBB11_8
 ; RV32ZBB-NEXT:  .LBB11_7:
 ; RV32ZBB-NEXT:    sltu a1, t0, a3
 ; RV32ZBB-NEXT:  .LBB11_8:
-; RV32ZBB-NEXT:    xor a4, a6, a4
+; RV32ZBB-NEXT:    xor a4, a5, a4
 ; RV32ZBB-NEXT:    xor a2, a7, a2
 ; RV32ZBB-NEXT:    or a2, a2, a4
 ; RV32ZBB-NEXT:    beqz a2, .LBB11_10
 ; RV32ZBB-NEXT:  # %bb.9:
-; RV32ZBB-NEXT:    mv a1, t1
+; RV32ZBB-NEXT:    mv a1, t2
 ; RV32ZBB-NEXT:  .LBB11_10:
 ; RV32ZBB-NEXT:    neg t0, a1
-; RV32ZBB-NEXT:    xor a2, a7, t0
-; RV32ZBB-NEXT:    xor a6, a6, t0
-; RV32ZBB-NEXT:    xor a4, a3, t0
-; RV32ZBB-NEXT:    sltu a3, a2, t0
-; RV32ZBB-NEXT:    add a7, a6, a1
-; RV32ZBB-NEXT:    sltu a6, a4, t0
-; RV32ZBB-NEXT:    sub a3, a7, a3
-; RV32ZBB-NEXT:    xor t1, a5, t0
-; RV32ZBB-NEXT:    mv a7, a6
-; RV32ZBB-NEXT:    beqz a5, .LBB11_12
-; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu a7, t1, t0
-; RV32ZBB-NEXT:  .LBB11_12:
+; RV32ZBB-NEXT:    xor a4, a7, t0
+; RV32ZBB-NEXT:    xor a2, a5, t0
+; RV32ZBB-NEXT:    xor a5, a6, t0
+; RV32ZBB-NEXT:    xor a3, a3, t0
+; RV32ZBB-NEXT:    sltu a7, a4, t0
 ; RV32ZBB-NEXT:    add a2, a2, a1
-; RV32ZBB-NEXT:    add t1, t1, a1
-; RV32ZBB-NEXT:    add a1, a4, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a7
 ; RV32ZBB-NEXT:    sub a2, a2, a7
-; RV32ZBB-NEXT:    sub a5, t1, a6
-; RV32ZBB-NEXT:    sub a3, a3, a4
+; RV32ZBB-NEXT:    sltu a7, a3, t0
+; RV32ZBB-NEXT:    mv t1, a7
+; RV32ZBB-NEXT:    beqz a6, .LBB11_12
+; RV32ZBB-NEXT:  # %bb.11:
+; RV32ZBB-NEXT:    sltu t1, a5, t0
+; RV32ZBB-NEXT:  .LBB11_12:
+; RV32ZBB-NEXT:    add a4, a4, a1
+; RV32ZBB-NEXT:    add a5, a5, a1
+; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    sltu a3, a4, t1
+; RV32ZBB-NEXT:    sub a4, a4, t1
+; RV32ZBB-NEXT:    sub a5, a5, a7
+; RV32ZBB-NEXT:    sub a2, a2, a3
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a3, 12(a0)
+; RV32ZBB-NEXT:    sw a4, 8(a0)
+; RV32ZBB-NEXT:    sw a2, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_ext_i128:
@@ -741,75 +741,75 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i128_undef:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a3, 0(a2)
-; RV32I-NEXT:    lw a5, 4(a2)
-; RV32I-NEXT:    lw a6, 8(a2)
+; RV32I-NEXT:    lw a6, 4(a2)
+; RV32I-NEXT:    lw a5, 8(a2)
 ; RV32I-NEXT:    lw a7, 12(a2)
+; RV32I-NEXT:    lw t0, 0(a1)
+; RV32I-NEXT:    lw t1, 4(a1)
 ; RV32I-NEXT:    lw a2, 8(a1)
 ; RV32I-NEXT:    lw a4, 12(a1)
-; RV32I-NEXT:    lw t0, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
-; RV32I-NEXT:    sltu t1, a2, a6
+; RV32I-NEXT:    sltu a1, a2, a5
 ; RV32I-NEXT:    sub a7, a4, a7
-; RV32I-NEXT:    sltu t2, t0, a3
-; RV32I-NEXT:    sub a7, a7, t1
-; RV32I-NEXT:    mv t1, t2
-; RV32I-NEXT:    beq a1, a5, .LBB12_2
+; RV32I-NEXT:    sub a7, a7, a1
+; RV32I-NEXT:    sltu a1, t0, a3
+; RV32I-NEXT:    mv t2, a1
+; RV32I-NEXT:    beq t1, a6, .LBB12_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t1, a1, a5
+; RV32I-NEXT:    sltu t2, t1, a6
 ; RV32I-NEXT:  .LBB12_2:
-; RV32I-NEXT:    sub t3, a2, a6
-; RV32I-NEXT:    sltu a6, t3, t1
-; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a7, t3, t1
-; RV32I-NEXT:    beq a6, a4, .LBB12_4
+; RV32I-NEXT:    sub t3, a2, a5
+; RV32I-NEXT:    sltu a5, t3, t2
+; RV32I-NEXT:    sub a5, a7, a5
+; RV32I-NEXT:    sub a7, t3, t2
+; RV32I-NEXT:    beq a5, a4, .LBB12_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu t1, a4, a6
+; RV32I-NEXT:    sltu t2, a4, a5
 ; RV32I-NEXT:    j .LBB12_5
 ; RV32I-NEXT:  .LBB12_4:
-; RV32I-NEXT:    sltu t1, a2, a7
+; RV32I-NEXT:    sltu t2, a2, a7
 ; RV32I-NEXT:  .LBB12_5:
-; RV32I-NEXT:    sub a5, a1, a5
-; RV32I-NEXT:    sub a5, a5, t2
+; RV32I-NEXT:    sub a6, t1, a6
+; RV32I-NEXT:    sub a6, a6, a1
 ; RV32I-NEXT:    sub a3, t0, a3
-; RV32I-NEXT:    beq a5, a1, .LBB12_7
+; RV32I-NEXT:    beq a6, t1, .LBB12_7
 ; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    sltu a1, a1, a5
+; RV32I-NEXT:    sltu a1, t1, a6
 ; RV32I-NEXT:    j .LBB12_8
 ; RV32I-NEXT:  .LBB12_7:
 ; RV32I-NEXT:    sltu a1, t0, a3
 ; RV32I-NEXT:  .LBB12_8:
-; RV32I-NEXT:    xor a4, a6, a4
+; RV32I-NEXT:    xor a4, a5, a4
 ; RV32I-NEXT:    xor a2, a7, a2
 ; RV32I-NEXT:    or a2, a2, a4
 ; RV32I-NEXT:    beqz a2, .LBB12_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    mv a1, t1
+; RV32I-NEXT:    mv a1, t2
 ; RV32I-NEXT:  .LBB12_10:
 ; RV32I-NEXT:    neg t0, a1
-; RV32I-NEXT:    xor a2, a7, t0
-; RV32I-NEXT:    xor a6, a6, t0
-; RV32I-NEXT:    xor a4, a3, t0
-; RV32I-NEXT:    sltu a3, a2, t0
-; RV32I-NEXT:    add a7, a6, a1
-; RV32I-NEXT:    sltu a6, a4, t0
-; RV32I-NEXT:    sub a3, a7, a3
-; RV32I-NEXT:    xor t1, a5, t0
-; RV32I-NEXT:    mv a7, a6
-; RV32I-NEXT:    beqz a5, .LBB12_12
-; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu a7, t1, t0
-; RV32I-NEXT:  .LBB12_12:
+; RV32I-NEXT:    xor a4, a7, t0
+; RV32I-NEXT:    xor a2, a5, t0
+; RV32I-NEXT:    xor a5, a6, t0
+; RV32I-NEXT:    xor a3, a3, t0
+; RV32I-NEXT:    sltu a7, a4, t0
 ; RV32I-NEXT:    add a2, a2, a1
-; RV32I-NEXT:    add t1, t1, a1
-; RV32I-NEXT:    add a1, a4, a1
-; RV32I-NEXT:    sltu a4, a2, a7
 ; RV32I-NEXT:    sub a2, a2, a7
-; RV32I-NEXT:    sub a5, t1, a6
-; RV32I-NEXT:    sub a3, a3, a4
+; RV32I-NEXT:    sltu a7, a3, t0
+; RV32I-NEXT:    mv t1, a7
+; RV32I-NEXT:    beqz a6, .LBB12_12
+; RV32I-NEXT:  # %bb.11:
+; RV32I-NEXT:    sltu t1, a5, t0
+; RV32I-NEXT:  .LBB12_12:
+; RV32I-NEXT:    add a4, a4, a1
+; RV32I-NEXT:    add a5, a5, a1
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    sltu a3, a4, t1
+; RV32I-NEXT:    sub a4, a4, t1
+; RV32I-NEXT:    sub a5, a5, a7
+; RV32I-NEXT:    sub a2, a2, a3
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a4, 8(a0)
+; RV32I-NEXT:    sw a2, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_ext_i128_undef:
@@ -837,75 +837,75 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-LABEL: abd_ext_i128_undef:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a3, 0(a2)
-; RV32ZBB-NEXT:    lw a5, 4(a2)
-; RV32ZBB-NEXT:    lw a6, 8(a2)
+; RV32ZBB-NEXT:    lw a6, 4(a2)
+; RV32ZBB-NEXT:    lw a5, 8(a2)
 ; RV32ZBB-NEXT:    lw a7, 12(a2)
+; RV32ZBB-NEXT:    lw t0, 0(a1)
+; RV32ZBB-NEXT:    lw t1, 4(a1)
 ; RV32ZBB-NEXT:    lw a2, 8(a1)
 ; RV32ZBB-NEXT:    lw a4, 12(a1)
-; RV32ZBB-NEXT:    lw t0, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 4(a1)
-; RV32ZBB-NEXT:    sltu t1, a2, a6
+; RV32ZBB-NEXT:    sltu a1, a2, a5
 ; RV32ZBB-NEXT:    sub a7, a4, a7
-; RV32ZBB-NEXT:    sltu t2, t0, a3
-; RV32ZBB-NEXT:    sub a7, a7, t1
-; RV32ZBB-NEXT:    mv t1, t2
-; RV32ZBB-NEXT:    beq a1, a5, .LBB12_2
+; RV32ZBB-NEXT:    sub a7, a7, a1
+; RV32ZBB-NEXT:    sltu a1, t0, a3
+; RV32ZBB-NEXT:    mv t2, a1
+; RV32ZBB-NEXT:    beq t1, a6, .LBB12_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t1, a1, a5
+; RV32ZBB-NEXT:    sltu t2, t1, a6
 ; RV32ZBB-NEXT:  .LBB12_2:
-; RV32ZBB-NEXT:    sub t3, a2, a6
-; RV32ZBB-NEXT:    sltu a6, t3, t1
-; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a7, t3, t1
-; RV32ZBB-NEXT:    beq a6, a4, .LBB12_4
+; RV32ZBB-NEXT:    sub t3, a2, a5
+; RV32ZBB-NEXT:    sltu a5, t3, t2
+; RV32ZBB-NEXT:    sub a5, a7, a5
+; RV32ZBB-NEXT:    sub a7, t3, t2
+; RV32ZBB-NEXT:    beq a5, a4, .LBB12_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu t1, a4, a6
+; RV32ZBB-NEXT:    sltu t2, a4, a5
 ; RV32ZBB-NEXT:    j .LBB12_5
 ; RV32ZBB-NEXT:  .LBB12_4:
-; RV32ZBB-NEXT:    sltu t1, a2, a7
+; RV32ZBB-NEXT:    sltu t2, a2, a7
 ; RV32ZBB-NEXT:  .LBB12_5:
-; RV32ZBB-NEXT:    sub a5, a1, a5
-; RV32ZBB-NEXT:    sub a5, a5, t2
+; RV32ZBB-NEXT:    sub a6, t1, a6
+; RV32ZBB-NEXT:    sub a6, a6, a1
 ; RV32ZBB-NEXT:    sub a3, t0, a3
-; RV32ZBB-NEXT:    beq a5, a1, .LBB12_7
+; RV32ZBB-NEXT:    beq a6, t1, .LBB12_7
 ; RV32ZBB-NEXT:  # %bb.6:
-; RV32ZBB-NEXT:    sltu a1, a1, a5
+; RV32ZBB-NEXT:    sltu a1, t1, a6
 ; RV32ZBB-NEXT:    j .LBB12_8
 ; RV32ZBB-NEXT:  .LBB12_7:
 ; RV32ZBB-NEXT:    sltu a1, t0, a3
 ; RV32ZBB-NEXT:  .LBB12_8:
-; RV32ZBB-NEXT:    xor a4, a6, a4
+; RV32ZBB-NEXT:    xor a4, a5, a4
 ; RV32ZBB-NEXT:    xor a2, a7, a2
 ; RV32ZBB-NEXT:    or a2, a2, a4
 ; RV32ZBB-NEXT:    beqz a2, .LBB12_10
 ; RV32ZBB-NEXT:  # %bb.9:
-; RV32ZBB-NEXT:    mv a1, t1
+; RV32ZBB-NEXT:    mv a1, t2
 ; RV32ZBB-NEXT:  .LBB12_10:
 ; RV32ZBB-NEXT:    neg t0, a1
-; RV32ZBB-NEXT:    xor a2, a7, t0
-; RV32ZBB-NEXT:    xor a6, a6, t0
-; RV32ZBB-NEXT:    xor a4, a3, t0
-; RV32ZBB-NEXT:    sltu a3, a2, t0
-; RV32ZBB-NEXT:    add a7, a6, a1
-; RV32ZBB-NEXT:    sltu a6, a4, t0
-; RV32ZBB-NEXT:    sub a3, a7, a3
-; RV32ZBB-NEXT:    xor t1, a5, t0
-; RV32ZBB-NEXT:    mv a7, a6
-; RV32ZBB-NEXT:    beqz a5, .LBB12_12
-; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu a7, t1, t0
-; RV32ZBB-NEXT:  .LBB12_12:
+; RV32ZBB-NEXT:    xor a4, a7, t0
+; RV32ZBB-NEXT:    xor a2, a5, t0
+; RV32ZBB-NEXT:    xor a5, a6, t0
+; RV32ZBB-NEXT:    xor a3, a3, t0
+; RV32ZBB-NEXT:    sltu a7, a4, t0
 ; RV32ZBB-NEXT:    add a2, a2, a1
-; RV32ZBB-NEXT:    add t1, t1, a1
-; RV32ZBB-NEXT:    add a1, a4, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a7
 ; RV32ZBB-NEXT:    sub a2, a2, a7
-; RV32ZBB-NEXT:    sub a5, t1, a6
-; RV32ZBB-NEXT:    sub a3, a3, a4
+; RV32ZBB-NEXT:    sltu a7, a3, t0
+; RV32ZBB-NEXT:    mv t1, a7
+; RV32ZBB-NEXT:    beqz a6, .LBB12_12
+; RV32ZBB-NEXT:  # %bb.11:
+; RV32ZBB-NEXT:    sltu t1, a5, t0
+; RV32ZBB-NEXT:  .LBB12_12:
+; RV32ZBB-NEXT:    add a4, a4, a1
+; RV32ZBB-NEXT:    add a5, a5, a1
+; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    sltu a3, a4, t1
+; RV32ZBB-NEXT:    sub a4, a4, t1
+; RV32ZBB-NEXT:    sub a5, a5, a7
+; RV32ZBB-NEXT:    sub a2, a2, a3
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a3, 12(a0)
+; RV32ZBB-NEXT:    sw a4, 8(a0)
+; RV32ZBB-NEXT:    sw a2, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_ext_i128_undef:
@@ -1132,75 +1132,75 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: abd_minmax_i128:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a3, 0(a2)
-; RV32I-NEXT:    lw a5, 4(a2)
-; RV32I-NEXT:    lw a6, 8(a2)
+; RV32I-NEXT:    lw a6, 4(a2)
+; RV32I-NEXT:    lw a5, 8(a2)
 ; RV32I-NEXT:    lw a7, 12(a2)
+; RV32I-NEXT:    lw t0, 0(a1)
+; RV32I-NEXT:    lw t1, 4(a1)
 ; RV32I-NEXT:    lw a2, 8(a1)
 ; RV32I-NEXT:    lw a4, 12(a1)
-; RV32I-NEXT:    lw t0, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
-; RV32I-NEXT:    sltu t1, a2, a6
+; RV32I-NEXT:    sltu a1, a2, a5
 ; RV32I-NEXT:    sub a7, a4, a7
-; RV32I-NEXT:    sltu t2, t0, a3
-; RV32I-NEXT:    sub a7, a7, t1
-; RV32I-NEXT:    mv t1, t2
-; RV32I-NEXT:    beq a1, a5, .LBB17_2
+; RV32I-NEXT:    sub a7, a7, a1
+; RV32I-NEXT:    sltu a1, t0, a3
+; RV32I-NEXT:    mv t2, a1
+; RV32I-NEXT:    beq t1, a6, .LBB17_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t1, a1, a5
+; RV32I-NEXT:    sltu t2, t1, a6
 ; RV32I-NEXT:  .LBB17_2:
-; RV32I-NEXT:    sub t3, a2, a6
-; RV32I-NEXT:    sltu a6, t3, t1
-; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a7, t3, t1
-; RV32I-NEXT:    beq a6, a4, .LBB17_4
+; RV32I-NEXT:    sub t3, a2, a5
+; RV32I-NEXT:    sltu a5, t3, t2
+; RV32I-NEXT:    sub a5, a7, a5
+; RV32I-NEXT:    sub a7, t3, t2
+; RV32I-NEXT:    beq a5, a4, .LBB17_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu t1, a4, a6
+; RV32I-NEXT:    sltu t2, a4, a5
 ; RV32I-NEXT:    j .LBB17_5
 ; RV32I-NEXT:  .LBB17_4:
-; RV32I-NEXT:    sltu t1, a2, a7
+; RV32I-NEXT:    sltu t2, a2, a7
 ; RV32I-NEXT:  .LBB17_5:
-; RV32I-NEXT:    sub a5, a1, a5
-; RV32I-NEXT:    sub a5, a5, t2
+; RV32I-NEXT:    sub a6, t1, a6
+; RV32I-NEXT:    sub a6, a6, a1
 ; RV32I-NEXT:    sub a3, t0, a3
-; RV32I-NEXT:    beq a5, a1, .LBB17_7
+; RV32I-NEXT:    beq a6, t1, .LBB17_7
 ; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    sltu a1, a1, a5
+; RV32I-NEXT:    sltu a1, t1, a6
 ; RV32I-NEXT:    j .LBB17_8
 ; RV32I-NEXT:  .LBB17_7:
 ; RV32I-NEXT:    sltu a1, t0, a3
 ; RV32I-NEXT:  .LBB17_8:
-; RV32I-NEXT:    xor a4, a6, a4
+; RV32I-NEXT:    xor a4, a5, a4
 ; RV32I-NEXT:    xor a2, a7, a2
 ; RV32I-NEXT:    or a2, a2, a4
 ; RV32I-NEXT:    beqz a2, .LBB17_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    mv a1, t1
+; RV32I-NEXT:    mv a1, t2
 ; RV32I-NEXT:  .LBB17_10:
 ; RV32I-NEXT:    neg t0, a1
-; RV32I-NEXT:    xor a2, a7, t0
-; RV32I-NEXT:    xor a6, a6, t0
-; RV32I-NEXT:    xor a4, a3, t0
-; RV32I-NEXT:    sltu a3, a2, t0
-; RV32I-NEXT:    add a7, a6, a1
-; RV32I-NEXT:    sltu a6, a4, t0
-; RV32I-NEXT:    sub a3, a7, a3
-; RV32I-NEXT:    xor t1, a5, t0
-; RV32I-NEXT:    mv a7, a6
-; RV32I-NEXT:    beqz a5, .LBB17_12
-; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu a7, t1, t0
-; RV32I-NEXT:  .LBB17_12:
+; RV32I-NEXT:    xor a4, a7, t0
+; RV32I-NEXT:    xor a2, a5, t0
+; RV32I-NEXT:    xor a5, a6, t0
+; RV32I-NEXT:    xor a3, a3, t0
+; RV32I-NEXT:    sltu a7, a4, t0
 ; RV32I-NEXT:    add a2, a2, a1
-; RV32I-NEXT:    add t1, t1, a1
-; RV32I-NEXT:    add a1, a4, a1
-; RV32I-NEXT:    sltu a4, a2, a7
 ; RV32I-NEXT:    sub a2, a2, a7
-; RV32I-NEXT:    sub a5, t1, a6
-; RV32I-NEXT:    sub a3, a3, a4
+; RV32I-NEXT:    sltu a7, a3, t0
+; RV32I-NEXT:    mv t1, a7
+; RV32I-NEXT:    beqz a6, .LBB17_12
+; RV32I-NEXT:  # %bb.11:
+; RV32I-NEXT:    sltu t1, a5, t0
+; RV32I-NEXT:  .LBB17_12:
+; RV32I-NEXT:    add a4, a4, a1
+; RV32I-NEXT:    add a5, a5, a1
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    sltu a3, a4, t1
+; RV32I-NEXT:    sub a4, a4, t1
+; RV32I-NEXT:    sub a5, a5, a7
+; RV32I-NEXT:    sub a2, a2, a3
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a4, 8(a0)
+; RV32I-NEXT:    sw a2, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_minmax_i128:
@@ -1228,75 +1228,75 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-LABEL: abd_minmax_i128:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a3, 0(a2)
-; RV32ZBB-NEXT:    lw a5, 4(a2)
-; RV32ZBB-NEXT:    lw a6, 8(a2)
+; RV32ZBB-NEXT:    lw a6, 4(a2)
+; RV32ZBB-NEXT:    lw a5, 8(a2)
 ; RV32ZBB-NEXT:    lw a7, 12(a2)
+; RV32ZBB-NEXT:    lw t0, 0(a1)
+; RV32ZBB-NEXT:    lw t1, 4(a1)
 ; RV32ZBB-NEXT:    lw a2, 8(a1)
 ; RV32ZBB-NEXT:    lw a4, 12(a1)
-; RV32ZBB-NEXT:    lw t0, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 4(a1)
-; RV32ZBB-NEXT:    sltu t1, a2, a6
+; RV32ZBB-NEXT:    sltu a1, a2, a5
 ; RV32ZBB-NEXT:    sub a7, a4, a7
-; RV32ZBB-NEXT:    sltu t2, t0, a3
-; RV32ZBB-NEXT:    sub a7, a7, t1
-; RV32ZBB-NEXT:    mv t1, t2
-; RV32ZBB-NEXT:    beq a1, a5, .LBB17_2
+; RV32ZBB-NEXT:    sub a7, a7, a1
+; RV32ZBB-NEXT:    sltu a1, t0, a3
+; RV32ZBB-NEXT:    mv t2, a1
+; RV32ZBB-NEXT:    beq t1, a6, .LBB17_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t1, a1, a5
+; RV32ZBB-NEXT:    sltu t2, t1, a6
 ; RV32ZBB-NEXT:  .LBB17_2:
-; RV32ZBB-NEXT:    sub t3, a2, a6
-; RV32ZBB-NEXT:    sltu a6, t3, t1
-; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a7, t3, t1
-; RV32ZBB-NEXT:    beq a6, a4, .LBB17_4
+; RV32ZBB-NEXT:    sub t3, a2, a5
+; RV32ZBB-NEXT:    sltu a5, t3, t2
+; RV32ZBB-NEXT:    sub a5, a7, a5
+; RV32ZBB-NEXT:    sub a7, t3, t2
+; RV32ZBB-NEXT:    beq a5, a4, .LBB17_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu t1, a4, a6
+; RV32ZBB-NEXT:    sltu t2, a4, a5
 ; RV32ZBB-NEXT:    j .LBB17_5
 ; RV32ZBB-NEXT:  .LBB17_4:
-; RV32ZBB-NEXT:    sltu t1, a2, a7
+; RV32ZBB-NEXT:    sltu t2, a2, a7
 ; RV32ZBB-NEXT:  .LBB17_5:
-; RV32ZBB-NEXT:    sub a5, a1, a5
-; RV32ZBB-NEXT:    sub a5, a5, t2
+; RV32ZBB-NEXT:    sub a6, t1, a6
+; RV32ZBB-NEXT:    sub a6, a6, a1
 ; RV32ZBB-NEXT:    sub a3, t0, a3
-; RV32ZBB-NEXT:    beq a5, a1, .LBB17_7
+; RV32ZBB-NEXT:    beq a6, t1, .LBB17_7
 ; RV32ZBB-NEXT:  # %bb.6:
-; RV32ZBB-NEXT:    sltu a1, a1, a5
+; RV32ZBB-NEXT:    sltu a1, t1, a6
 ; RV32ZBB-NEXT:    j .LBB17_8
 ; RV32ZBB-NEXT:  .LBB17_7:
 ; RV32ZBB-NEXT:    sltu a1, t0, a3
 ; RV32ZBB-NEXT:  .LBB17_8:
-; RV32ZBB-NEXT:    xor a4, a6, a4
+; RV32ZBB-NEXT:    xor a4, a5, a4
 ; RV32ZBB-NEXT:    xor a2, a7, a2
 ; RV32ZBB-NEXT:    or a2, a2, a4
 ; RV32ZBB-NEXT:    beqz a2, .LBB17_10
 ; RV32ZBB-NEXT:  # %bb.9:
-; RV32ZBB-NEXT:    mv a1, t1
+; RV32ZBB-NEXT:    mv a1, t2
 ; RV32ZBB-NEXT:  .LBB17_10:
 ; RV32ZBB-NEXT:    neg t0, a1
-; RV32ZBB-NEXT:    xor a2, a7, t0
-; RV32ZBB-NEXT:    xor a6, a6, t0
-; RV32ZBB-NEXT:    xor a4, a3, t0
-; RV32ZBB-NEXT:    sltu a3, a2, t0
-; RV32ZBB-NEXT:    add a7, a6, a1
-; RV32ZBB-NEXT:    sltu a6, a4, t0
-; RV32ZBB-NEXT:    sub a3, a7, a3
-; RV32ZBB-NEXT:    xor t1, a5, t0
-; RV32ZBB-NEXT:    mv a7, a6
-; RV32ZBB-NEXT:    beqz a5, .LBB17_12
-; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu a7, t1, t0
-; RV32ZBB-NEXT:  .LBB17_12:
+; RV32ZBB-NEXT:    xor a4, a7, t0
+; RV32ZBB-NEXT:    xor a2, a5, t0
+; RV32ZBB-NEXT:    xor a5, a6, t0
+; RV32ZBB-NEXT:    xor a3, a3, t0
+; RV32ZBB-NEXT:    sltu a7, a4, t0
 ; RV32ZBB-NEXT:    add a2, a2, a1
-; RV32ZBB-NEXT:    add t1, t1, a1
-; RV32ZBB-NEXT:    add a1, a4, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a7
 ; RV32ZBB-NEXT:    sub a2, a2, a7
-; RV32ZBB-NEXT:    sub a5, t1, a6
-; RV32ZBB-NEXT:    sub a3, a3, a4
+; RV32ZBB-NEXT:    sltu a7, a3, t0
+; RV32ZBB-NEXT:    mv t1, a7
+; RV32ZBB-NEXT:    beqz a6, .LBB17_12
+; RV32ZBB-NEXT:  # %bb.11:
+; RV32ZBB-NEXT:    sltu t1, a5, t0
+; RV32ZBB-NEXT:  .LBB17_12:
+; RV32ZBB-NEXT:    add a4, a4, a1
+; RV32ZBB-NEXT:    add a5, a5, a1
+; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    sltu a3, a4, t1
+; RV32ZBB-NEXT:    sub a4, a4, t1
+; RV32ZBB-NEXT:    sub a5, a5, a7
+; RV32ZBB-NEXT:    sub a2, a2, a3
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a3, 12(a0)
+; RV32ZBB-NEXT:    sw a4, 8(a0)
+; RV32ZBB-NEXT:    sw a2, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_minmax_i128:
@@ -1525,75 +1525,75 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: abd_cmp_i128:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a3, 0(a2)
-; RV32I-NEXT:    lw a5, 4(a2)
-; RV32I-NEXT:    lw a6, 8(a2)
+; RV32I-NEXT:    lw a6, 4(a2)
+; RV32I-NEXT:    lw a5, 8(a2)
 ; RV32I-NEXT:    lw a7, 12(a2)
+; RV32I-NEXT:    lw t0, 0(a1)
+; RV32I-NEXT:    lw t1, 4(a1)
 ; RV32I-NEXT:    lw a2, 8(a1)
 ; RV32I-NEXT:    lw a4, 12(a1)
-; RV32I-NEXT:    lw t0, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
-; RV32I-NEXT:    sltu t1, a2, a6
+; RV32I-NEXT:    sltu a1, a2, a5
 ; RV32I-NEXT:    sub a7, a4, a7
-; RV32I-NEXT:    sltu t2, t0, a3
-; RV32I-NEXT:    sub a7, a7, t1
-; RV32I-NEXT:    mv t1, t2
-; RV32I-NEXT:    beq a1, a5, .LBB22_2
+; RV32I-NEXT:    sub a7, a7, a1
+; RV32I-NEXT:    sltu a1, t0, a3
+; RV32I-NEXT:    mv t2, a1
+; RV32I-NEXT:    beq t1, a6, .LBB22_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t1, a1, a5
+; RV32I-NEXT:    sltu t2, t1, a6
 ; RV32I-NEXT:  .LBB22_2:
-; RV32I-NEXT:    sub t3, a2, a6
-; RV32I-NEXT:    sltu a6, t3, t1
-; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a7, t3, t1
-; RV32I-NEXT:    beq a6, a4, .LBB22_4
+; RV32I-NEXT:    sub t3, a2, a5
+; RV32I-NEXT:    sltu a5, t3, t2
+; RV32I-NEXT:    sub a5, a7, a5
+; RV32I-NEXT:    sub a7, t3, t2
+; RV32I-NEXT:    beq a5, a4, .LBB22_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu t1, a4, a6
+; RV32I-NEXT:    sltu t2, a4, a5
 ; RV32I-NEXT:    j .LBB22_5
 ; RV32I-NEXT:  .LBB22_4:
-; RV32I-NEXT:    sltu t1, a2, a7
+; RV32I-NEXT:    sltu t2, a2, a7
 ; RV32I-NEXT:  .LBB22_5:
-; RV32I-NEXT:    sub a5, a1, a5
-; RV32I-NEXT:    sub a5, a5, t2
+; RV32I-NEXT:    sub a6, t1, a6
+; RV32I-NEXT:    sub a6, a6, a1
 ; RV32I-NEXT:    sub a3, t0, a3
-; RV32I-NEXT:    beq a5, a1, .LBB22_7
+; RV32I-NEXT:    beq a6, t1, .LBB22_7
 ; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    sltu a1, a1, a5
+; RV32I-NEXT:    sltu a1, t1, a6
 ; RV32I-NEXT:    j .LBB22_8
 ; RV32I-NEXT:  .LBB22_7:
 ; RV32I-NEXT:    sltu a1, t0, a3
 ; RV32I-NEXT:  .LBB22_8:
-; RV32I-NEXT:    xor a4, a6, a4
+; RV32I-NEXT:    xor a4, a5, a4
 ; RV32I-NEXT:    xor a2, a7, a2
 ; RV32I-NEXT:    or a2, a2, a4
 ; RV32I-NEXT:    beqz a2, .LBB22_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    mv a1, t1
+; RV32I-NEXT:    mv a1, t2
 ; RV32I-NEXT:  .LBB22_10:
 ; RV32I-NEXT:    neg t0, a1
-; RV32I-NEXT:    xor a2, a7, t0
-; RV32I-NEXT:    xor a6, a6, t0
-; RV32I-NEXT:    xor a4, a3, t0
-; RV32I-NEXT:    sltu a3, a2, t0
-; RV32I-NEXT:    add a7, a6, a1
-; RV32I-NEXT:    sltu a6, a4, t0
-; RV32I-NEXT:    sub a3, a7, a3
-; RV32I-NEXT:    xor t1, a5, t0
-; RV32I-NEXT:    mv a7, a6
-; RV32I-NEXT:    beqz a5, .LBB22_12
-; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu a7, t1, t0
-; RV32I-NEXT:  .LBB22_12:
+; RV32I-NEXT:    xor a4, a7, t0
+; RV32I-NEXT:    xor a2, a5, t0
+; RV32I-NEXT:    xor a5, a6, t0
+; RV32I-NEXT:    xor a3, a3, t0
+; RV32I-NEXT:    sltu a7, a4, t0
 ; RV32I-NEXT:    add a2, a2, a1
-; RV32I-NEXT:    add t1, t1, a1
-; RV32I-NEXT:    add a1, a4, a1
-; RV32I-NEXT:    sltu a4, a2, a7
 ; RV32I-NEXT:    sub a2, a2, a7
-; RV32I-NEXT:    sub a5, t1, a6
-; RV32I-NEXT:    sub a3, a3, a4
+; RV32I-NEXT:    sltu a7, a3, t0
+; RV32I-NEXT:    mv t1, a7
+; RV32I-NEXT:    beqz a6, .LBB22_12
+; RV32I-NEXT:  # %bb.11:
+; RV32I-NEXT:    sltu t1, a5, t0
+; RV32I-NEXT:  .LBB22_12:
+; RV32I-NEXT:    add a4, a4, a1
+; RV32I-NEXT:    add a5, a5, a1
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    sltu a3, a4, t1
+; RV32I-NEXT:    sub a4, a4, t1
+; RV32I-NEXT:    sub a5, a5, a7
+; RV32I-NEXT:    sub a2, a2, a3
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a4, 8(a0)
+; RV32I-NEXT:    sw a2, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_cmp_i128:
@@ -1621,75 +1621,75 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-LABEL: abd_cmp_i128:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a3, 0(a2)
-; RV32ZBB-NEXT:    lw a5, 4(a2)
-; RV32ZBB-NEXT:    lw a6, 8(a2)
+; RV32ZBB-NEXT:    lw a6, 4(a2)
+; RV32ZBB-NEXT:    lw a5, 8(a2)
 ; RV32ZBB-NEXT:    lw a7, 12(a2)
+; RV32ZBB-NEXT:    lw t0, 0(a1)
+; RV32ZBB-NEXT:    lw t1, 4(a1)
 ; RV32ZBB-NEXT:    lw a2, 8(a1)
 ; RV32ZBB-NEXT:    lw a4, 12(a1)
-; RV32ZBB-NEXT:    lw t0, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 4(a1)
-; RV32ZBB-NEXT:    sltu t1, a2, a6
+; RV32ZBB-NEXT:    sltu a1, a2, a5
 ; RV32ZBB-NEXT:    sub a7, a4, a7
-; RV32ZBB-NEXT:    sltu t2, t0, a3
-; RV32ZBB-NEXT:    sub a7, a7, t1
-; RV32ZBB-NEXT:    mv t1, t2
-; RV32ZBB-NEXT:    beq a1, a5, .LBB22_2
+; RV32ZBB-NEXT:    sub a7, a7, a1
+; RV32ZBB-NEXT:    sltu a1, t0, a3
+; RV32ZBB-NEXT:    mv t2, a1
+; RV32ZBB-NEXT:    beq t1, a6, .LBB22_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t1, a1, a5
+; RV32ZBB-NEXT:    sltu t2, t1, a6
 ; RV32ZBB-NEXT:  .LBB22_2:
-; RV32ZBB-NEXT:    sub t3, a2, a6
-; RV32ZBB-NEXT:    sltu a6, t3, t1
-; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a7, t3, t1
-; RV32ZBB-NEXT:    beq a6, a4, .LBB22_4
+; RV32ZBB-NEXT:    sub t3, a2, a5
+; RV32ZBB-NEXT:    sltu a5, t3, t2
+; RV32ZBB-NEXT:    sub a5, a7, a5
+; RV32ZBB-NEXT:    sub a7, t3, t2
+; RV32ZBB-NEXT:    beq a5, a4, .LBB22_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu t1, a4, a6
+; RV32ZBB-NEXT:    sltu t2, a4, a5
 ; RV32ZBB-NEXT:    j .LBB22_5
 ; RV32ZBB-NEXT:  .LBB22_4:
-; RV32ZBB-NEXT:    sltu t1, a2, a7
+; RV32ZBB-NEXT:    sltu t2, a2, a7
 ; RV32ZBB-NEXT:  .LBB22_5:
-; RV32ZBB-NEXT:    sub a5, a1, a5
-; RV32ZBB-NEXT:    sub a5, a5, t2
+; RV32ZBB-NEXT:    sub a6, t1, a6
+; RV32ZBB-NEXT:    sub a6, a6, a1
 ; RV32ZBB-NEXT:    sub a3, t0, a3
-; RV32ZBB-NEXT:    beq a5, a1, .LBB22_7
+; RV32ZBB-NEXT:    beq a6, t1, .LBB22_7
 ; RV32ZBB-NEXT:  # %bb.6:
-; RV32ZBB-NEXT:    sltu a1, a1, a5
+; RV32ZBB-NEXT:    sltu a1, t1, a6
 ; RV32ZBB-NEXT:    j .LBB22_8
 ; RV32ZBB-NEXT:  .LBB22_7:
 ; RV32ZBB-NEXT:    sltu a1, t0, a3
 ; RV32ZBB-NEXT:  .LBB22_8:
-; RV32ZBB-NEXT:    xor a4, a6, a4
+; RV32ZBB-NEXT:    xor a4, a5, a4
 ; RV32ZBB-NEXT:    xor a2, a7, a2
 ; RV32ZBB-NEXT:    or a2, a2, a4
 ; RV32ZBB-NEXT:    beqz a2, .LBB22_10
 ; RV32ZBB-NEXT:  # %bb.9:
-; RV32ZBB-NEXT:    mv a1, t1
+; RV32ZBB-NEXT:    mv a1, t2
 ; RV32ZBB-NEXT:  .LBB22_10:
 ; RV32ZBB-NEXT:    neg t0, a1
-; RV32ZBB-NEXT:    xor a2, a7, t0
-; RV32ZBB-NEXT:    xor a6, a6, t0
-; RV32ZBB-NEXT:    xor a4, a3, t0
-; RV32ZBB-NEXT:    sltu a3, a2, t0
-; RV32ZBB-NEXT:    add a7, a6, a1
-; RV32ZBB-NEXT:    sltu a6, a4, t0
-; RV32ZBB-NEXT:    sub a3, a7, a3
-; RV32ZBB-NEXT:    xor t1, a5, t0
-; RV32ZBB-NEXT:    mv a7, a6
-; RV32ZBB-NEXT:    beqz a5, .LBB22_12
-; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu a7, t1, t0
-; RV32ZBB-NEXT:  .LBB22_12:
+; RV32ZBB-NEXT:    xor a4, a7, t0
+; RV32ZBB-NEXT:    xor a2, a5, t0
+; RV32ZBB-NEXT:    xor a5, a6, t0
+; RV32ZBB-NEXT:    xor a3, a3, t0
+; RV32ZBB-NEXT:    sltu a7, a4, t0
 ; RV32ZBB-NEXT:    add a2, a2, a1
-; RV32ZBB-NEXT:    add t1, t1, a1
-; RV32ZBB-NEXT:    add a1, a4, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a7
 ; RV32ZBB-NEXT:    sub a2, a2, a7
-; RV32ZBB-NEXT:    sub a5, t1, a6
-; RV32ZBB-NEXT:    sub a3, a3, a4
+; RV32ZBB-NEXT:    sltu a7, a3, t0
+; RV32ZBB-NEXT:    mv t1, a7
+; RV32ZBB-NEXT:    beqz a6, .LBB22_12
+; RV32ZBB-NEXT:  # %bb.11:
+; RV32ZBB-NEXT:    sltu t1, a5, t0
+; RV32ZBB-NEXT:  .LBB22_12:
+; RV32ZBB-NEXT:    add a4, a4, a1
+; RV32ZBB-NEXT:    add a5, a5, a1
+; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    sltu a3, a4, t1
+; RV32ZBB-NEXT:    sub a4, a4, t1
+; RV32ZBB-NEXT:    sub a5, a5, a7
+; RV32ZBB-NEXT:    sub a2, a2, a3
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a3, 12(a0)
+; RV32ZBB-NEXT:    sw a4, 8(a0)
+; RV32ZBB-NEXT:    sw a2, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_cmp_i128:
@@ -1919,75 +1919,75 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: abd_select_i128:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a3, 0(a2)
-; RV32I-NEXT:    lw a5, 4(a2)
-; RV32I-NEXT:    lw a6, 8(a2)
+; RV32I-NEXT:    lw a6, 4(a2)
+; RV32I-NEXT:    lw a5, 8(a2)
 ; RV32I-NEXT:    lw a7, 12(a2)
+; RV32I-NEXT:    lw t0, 0(a1)
+; RV32I-NEXT:    lw t1, 4(a1)
 ; RV32I-NEXT:    lw a2, 8(a1)
 ; RV32I-NEXT:    lw a4, 12(a1)
-; RV32I-NEXT:    lw t0, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
-; RV32I-NEXT:    sltu t1, a2, a6
+; RV32I-NEXT:    sltu a1, a2, a5
 ; RV32I-NEXT:    sub a7, a4, a7
-; RV32I-NEXT:    sltu t2, t0, a3
-; RV32I-NEXT:    sub a7, a7, t1
-; RV32I-NEXT:    mv t1, t2
-; RV32I-NEXT:    beq a1, a5, .LBB27_2
+; RV32I-NEXT:    sub a7, a7, a1
+; RV32I-NEXT:    sltu a1, t0, a3
+; RV32I-NEXT:    mv t2, a1
+; RV32I-NEXT:    beq t1, a6, .LBB27_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t1, a1, a5
+; RV32I-NEXT:    sltu t2, t1, a6
 ; RV32I-NEXT:  .LBB27_2:
-; RV32I-NEXT:    sub t3, a2, a6
-; RV32I-NEXT:    sltu a6, t3, t1
-; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a7, t3, t1
-; RV32I-NEXT:    beq a6, a4, .LBB27_4
+; RV32I-NEXT:    sub t3, a2, a5
+; RV32I-NEXT:    sltu a5, t3, t2
+; RV32I-NEXT:    sub a5, a7, a5
+; RV32I-NEXT:    sub a7, t3, t2
+; RV32I-NEXT:    beq a5, a4, .LBB27_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu t1, a4, a6
+; RV32I-NEXT:    sltu t2, a4, a5
 ; RV32I-NEXT:    j .LBB27_5
 ; RV32I-NEXT:  .LBB27_4:
-; RV32I-NEXT:    sltu t1, a2, a7
+; RV32I-NEXT:    sltu t2, a2, a7
 ; RV32I-NEXT:  .LBB27_5:
-; RV32I-NEXT:    sub a5, a1, a5
-; RV32I-NEXT:    sub a5, a5, t2
+; RV32I-NEXT:    sub a6, t1, a6
+; RV32I-NEXT:    sub a6, a6, a1
 ; RV32I-NEXT:    sub a3, t0, a3
-; RV32I-NEXT:    beq a5, a1, .LBB27_7
+; RV32I-NEXT:    beq a6, t1, .LBB27_7
 ; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    sltu a1, a1, a5
+; RV32I-NEXT:    sltu a1, t1, a6
 ; RV32I-NEXT:    j .LBB27_8
 ; RV32I-NEXT:  .LBB27_7:
 ; RV32I-NEXT:    sltu a1, t0, a3
 ; RV32I-NEXT:  .LBB27_8:
-; RV32I-NEXT:    xor a4, a6, a4
+; RV32I-NEXT:    xor a4, a5, a4
 ; RV32I-NEXT:    xor a2, a7, a2
 ; RV32I-NEXT:    or a2, a2, a4
 ; RV32I-NEXT:    beqz a2, .LBB27_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    mv a1, t1
+; RV32I-NEXT:    mv a1, t2
 ; RV32I-NEXT:  .LBB27_10:
 ; RV32I-NEXT:    neg t0, a1
-; RV32I-NEXT:    xor a2, a7, t0
-; RV32I-NEXT:    xor a6, a6, t0
-; RV32I-NEXT:    xor a4, a3, t0
-; RV32I-NEXT:    sltu a3, a2, t0
-; RV32I-NEXT:    add a7, a6, a1
-; RV32I-NEXT:    sltu a6, a4, t0
-; RV32I-NEXT:    sub a3, a7, a3
-; RV32I-NEXT:    xor t1, a5, t0
-; RV32I-NEXT:    mv a7, a6
-; RV32I-NEXT:    beqz a5, .LBB27_12
-; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu a7, t1, t0
-; RV32I-NEXT:  .LBB27_12:
+; RV32I-NEXT:    xor a4, a7, t0
+; RV32I-NEXT:    xor a2, a5, t0
+; RV32I-NEXT:    xor a5, a6, t0
+; RV32I-NEXT:    xor a3, a3, t0
+; RV32I-NEXT:    sltu a7, a4, t0
 ; RV32I-NEXT:    add a2, a2, a1
-; RV32I-NEXT:    add t1, t1, a1
-; RV32I-NEXT:    add a1, a4, a1
-; RV32I-NEXT:    sltu a4, a2, a7
 ; RV32I-NEXT:    sub a2, a2, a7
-; RV32I-NEXT:    sub a5, t1, a6
-; RV32I-NEXT:    sub a3, a3, a4
+; RV32I-NEXT:    sltu a7, a3, t0
+; RV32I-NEXT:    mv t1, a7
+; RV32I-NEXT:    beqz a6, .LBB27_12
+; RV32I-NEXT:  # %bb.11:
+; RV32I-NEXT:    sltu t1, a5, t0
+; RV32I-NEXT:  .LBB27_12:
+; RV32I-NEXT:    add a4, a4, a1
+; RV32I-NEXT:    add a5, a5, a1
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    sltu a3, a4, t1
+; RV32I-NEXT:    sub a4, a4, t1
+; RV32I-NEXT:    sub a5, a5, a7
+; RV32I-NEXT:    sub a2, a2, a3
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a4, 8(a0)
+; RV32I-NEXT:    sw a2, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_select_i128:
@@ -2015,75 +2015,75 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-LABEL: abd_select_i128:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a3, 0(a2)
-; RV32ZBB-NEXT:    lw a5, 4(a2)
-; RV32ZBB-NEXT:    lw a6, 8(a2)
+; RV32ZBB-NEXT:    lw a6, 4(a2)
+; RV32ZBB-NEXT:    lw a5, 8(a2)
 ; RV32ZBB-NEXT:    lw a7, 12(a2)
+; RV32ZBB-NEXT:    lw t0, 0(a1)
+; RV32ZBB-NEXT:    lw t1, 4(a1)
 ; RV32ZBB-NEXT:    lw a2, 8(a1)
 ; RV32ZBB-NEXT:    lw a4, 12(a1)
-; RV32ZBB-NEXT:    lw t0, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 4(a1)
-; RV32ZBB-NEXT:    sltu t1, a2, a6
+; RV32ZBB-NEXT:    sltu a1, a2, a5
 ; RV32ZBB-NEXT:    sub a7, a4, a7
-; RV32ZBB-NEXT:    sltu t2, t0, a3
-; RV32ZBB-NEXT:    sub a7, a7, t1
-; RV32ZBB-NEXT:    mv t1, t2
-; RV32ZBB-NEXT:    beq a1, a5, .LBB27_2
+; RV32ZBB-NEXT:    sub a7, a7, a1
+; RV32ZBB-NEXT:    sltu a1, t0, a3
+; RV32ZBB-NEXT:    mv t2, a1
+; RV32ZBB-NEXT:    beq t1, a6, .LBB27_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t1, a1, a5
+; RV32ZBB-NEXT:    sltu t2, t1, a6
 ; RV32ZBB-NEXT:  .LBB27_2:
-; RV32ZBB-NEXT:    sub t3, a2, a6
-; RV32ZBB-NEXT:    sltu a6, t3, t1
-; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a7, t3, t1
-; RV32ZBB-NEXT:    beq a6, a4, .LBB27_4
+; RV32ZBB-NEXT:    sub t3, a2, a5
+; RV32ZBB-NEXT:    sltu a5, t3, t2
+; RV32ZBB-NEXT:    sub a5, a7, a5
+; RV32ZBB-NEXT:    sub a7, t3, t2
+; RV32ZBB-NEXT:    beq a5, a4, .LBB27_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu t1, a4, a6
+; RV32ZBB-NEXT:    sltu t2, a4, a5
 ; RV32ZBB-NEXT:    j .LBB27_5
 ; RV32ZBB-NEXT:  .LBB27_4:
-; RV32ZBB-NEXT:    sltu t1, a2, a7
+; RV32ZBB-NEXT:    sltu t2, a2, a7
 ; RV32ZBB-NEXT:  .LBB27_5:
-; RV32ZBB-NEXT:    sub a5, a1, a5
-; RV32ZBB-NEXT:    sub a5, a5, t2
+; RV32ZBB-NEXT:    sub a6, t1, a6
+; RV32ZBB-NEXT:    sub a6, a6, a1
 ; RV32ZBB-NEXT:    sub a3, t0, a3
-; RV32ZBB-NEXT:    beq a5, a1, .LBB27_7
+; RV32ZBB-NEXT:    beq a6, t1, .LBB27_7
 ; RV32ZBB-NEXT:  # %bb.6:
-; RV32ZBB-NEXT:    sltu a1, a1, a5
+; RV32ZBB-NEXT:    sltu a1, t1, a6
 ; RV32ZBB-NEXT:    j .LBB27_8
 ; RV32ZBB-NEXT:  .LBB27_7:
 ; RV32ZBB-NEXT:    sltu a1, t0, a3
 ; RV32ZBB-NEXT:  .LBB27_8:
-; RV32ZBB-NEXT:    xor a4, a6, a4
+; RV32ZBB-NEXT:    xor a4, a5, a4
 ; RV32ZBB-NEXT:    xor a2, a7, a2
 ; RV32ZBB-NEXT:    or a2, a2, a4
 ; RV32ZBB-NEXT:    beqz a2, .LBB27_10
 ; RV32ZBB-NEXT:  # %bb.9:
-; RV32ZBB-NEXT:    mv a1, t1
+; RV32ZBB-NEXT:    mv a1, t2
 ; RV32ZBB-NEXT:  .LBB27_10:
 ; RV32ZBB-NEXT:    neg t0, a1
-; RV32ZBB-NEXT:    xor a2, a7, t0
-; RV32ZBB-NEXT:    xor a6, a6, t0
-; RV32ZBB-NEXT:    xor a4, a3, t0
-; RV32ZBB-NEXT:    sltu a3, a2, t0
-; RV32ZBB-NEXT:    add a7, a6, a1
-; RV32ZBB-NEXT:    sltu a6, a4, t0
-; RV32ZBB-NEXT:    sub a3, a7, a3
-; RV32ZBB-NEXT:    xor t1, a5, t0
-; RV32ZBB-NEXT:    mv a7, a6
-; RV32ZBB-NEXT:    beqz a5, .LBB27_12
-; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu a7, t1, t0
-; RV32ZBB-NEXT:  .LBB27_12:
+; RV32ZBB-NEXT:    xor a4, a7, t0
+; RV32ZBB-NEXT:    xor a2, a5, t0
+; RV32ZBB-NEXT:    xor a5, a6, t0
+; RV32ZBB-NEXT:    xor a3, a3, t0
+; RV32ZBB-NEXT:    sltu a7, a4, t0
 ; RV32ZBB-NEXT:    add a2, a2, a1
-; RV32ZBB-NEXT:    add t1, t1, a1
-; RV32ZBB-NEXT:    add a1, a4, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a7
 ; RV32ZBB-NEXT:    sub a2, a2, a7
-; RV32ZBB-NEXT:    sub a5, t1, a6
-; RV32ZBB-NEXT:    sub a3, a3, a4
+; RV32ZBB-NEXT:    sltu a7, a3, t0
+; RV32ZBB-NEXT:    mv t1, a7
+; RV32ZBB-NEXT:    beqz a6, .LBB27_12
+; RV32ZBB-NEXT:  # %bb.11:
+; RV32ZBB-NEXT:    sltu t1, a5, t0
+; RV32ZBB-NEXT:  .LBB27_12:
+; RV32ZBB-NEXT:    add a4, a4, a1
+; RV32ZBB-NEXT:    add a5, a5, a1
+; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    sltu a3, a4, t1
+; RV32ZBB-NEXT:    sub a4, a4, t1
+; RV32ZBB-NEXT:    sub a5, a5, a7
+; RV32ZBB-NEXT:    sub a2, a2, a3
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a3, 12(a0)
+; RV32ZBB-NEXT:    sw a4, 8(a0)
+; RV32ZBB-NEXT:    sw a2, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_select_i128:
diff --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll
index 5d4478f9d4b5f..533482e9fdeb4 100644
--- a/llvm/test/CodeGen/RISCV/add-before-shl.ll
+++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll
@@ -200,26 +200,26 @@ define i128 @add_wide_operand(i128 %a) nounwind {
 ;
 ; RV32C-LABEL: add_wide_operand:
 ; RV32C:       # %bb.0:
-; RV32C-NEXT:    c.lw a4, 12(a1)
-; RV32C-NEXT:    c.lw a3, 0(a1)
+; RV32C-NEXT:    lw a6, 0(a1)
 ; RV32C-NEXT:    c.lw a2, 4(a1)
-; RV32C-NEXT:    c.lw a1, 8(a1)
+; RV32C-NEXT:    c.lw a4, 8(a1)
+; RV32C-NEXT:    c.lw a1, 12(a1)
 ; RV32C-NEXT:    c.lui a5, 16
-; RV32C-NEXT:    add a6, a4, a5
-; RV32C-NEXT:    srli a5, a3, 29
-; RV32C-NEXT:    slli a4, a2, 3
-; RV32C-NEXT:    c.or a4, a5
-; RV32C-NEXT:    srli a5, a1, 29
+; RV32C-NEXT:    c.add a1, a5
+; RV32C-NEXT:    srli a5, a6, 29
+; RV32C-NEXT:    slli a3, a2, 3
+; RV32C-NEXT:    c.or a3, a5
+; RV32C-NEXT:    srli a5, a4, 29
 ; RV32C-NEXT:    c.srli a2, 29
-; RV32C-NEXT:    c.slli a1, 3
-; RV32C-NEXT:    c.slli a3, 3
+; RV32C-NEXT:    c.slli a4, 3
 ; RV32C-NEXT:    c.slli a6, 3
-; RV32C-NEXT:    c.or a1, a2
-; RV32C-NEXT:    or a2, a6, a5
-; RV32C-NEXT:    c.sw a3, 0(a0)
-; RV32C-NEXT:    c.sw a4, 4(a0)
-; RV32C-NEXT:    c.sw a1, 8(a0)
-; RV32C-NEXT:    c.sw a2, 12(a0)
+; RV32C-NEXT:    c.slli a1, 3
+; RV32C-NEXT:    c.or a2, a4
+; RV32C-NEXT:    c.or a1, a5
+; RV32C-NEXT:    sw a6, 0(a0)
+; RV32C-NEXT:    c.sw a3, 4(a0)
+; RV32C-NEXT:    c.sw a2, 8(a0)
+; RV32C-NEXT:    c.sw a1, 12(a0)
 ; RV32C-NEXT:    c.jr ra
 ;
 ; RV64C-LABEL: add_wide_operand:
diff --git a/llvm/test/CodeGen/RISCV/add-imm.ll b/llvm/test/CodeGen/RISCV/add-imm.ll
index 84deb4c00ac8d..21597beb0c483 100644
--- a/llvm/test/CodeGen/RISCV/add-imm.ll
+++ b/llvm/test/CodeGen/RISCV/add-imm.ll
@@ -214,28 +214,28 @@ define void @add32_reject() nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, %hi(ga)
 ; RV32I-NEXT:    lui a1, %hi(gb)
-; RV32I-NEXT:    lw a2, %lo(ga)(a0)
-; RV32I-NEXT:    lw a3, %lo(gb)(a1)
-; RV32I-NEXT:    lui a4, 1
-; RV32I-NEXT:    addi a4, a4, -1096
-; RV32I-NEXT:    add a2, a2, a4
-; RV32I-NEXT:    add a3, a3, a4
-; RV32I-NEXT:    sw a2, %lo(ga)(a0)
-; RV32I-NEXT:    sw a3, %lo(gb)(a1)
+; RV32I-NEXT:    lui a2, 1
+; RV32I-NEXT:    lw a3, %lo(ga)(a0)
+; RV32I-NEXT:    lw a4, %lo(gb)(a1)
+; RV32I-NEXT:    addi a2, a2, -1096
+; RV32I-NEXT:    add a3, a3, a2
+; RV32I-NEXT:    add a2, a4, a2
+; RV32I-NEXT:    sw a3, %lo(ga)(a0)
+; RV32I-NEXT:    sw a2, %lo(gb)(a1)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: add32_reject:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a0, %hi(ga)
 ; RV64I-NEXT:    lui a1, %hi(gb)
-; RV64I-NEXT:    lw a2, %lo(ga)(a0)
-; RV64I-NEXT:    lw a3, %lo(gb)(a1)
-; RV64I-NEXT:    lui a4, 1
-; RV64I-NEXT:    addi a4, a4, -1096
-; RV64I-NEXT:    add a2, a2, a4
-; RV64I-NEXT:    add a3, a3, a4
-; RV64I-NEXT:    sw a2, %lo(ga)(a0)
-; RV64I-NEXT:    sw a3, %lo(gb)(a1)
+; RV64I-NEXT:    lui a2, 1
+; RV64I-NEXT:    lw a3, %lo(ga)(a0)
+; RV64I-NEXT:    lw a4, %lo(gb)(a1)
+; RV64I-NEXT:    addi a2, a2, -1096
+; RV64I-NEXT:    add a3, a3, a2
+; RV64I-NEXT:    add a2, a4, a2
+; RV64I-NEXT:    sw a3, %lo(ga)(a0)
+; RV64I-NEXT:    sw a2, %lo(gb)(a1)
 ; RV64I-NEXT:    ret
   %1 = load i32, ptr @ga, align 4
   %2 = load i32, ptr @gb, align 4
diff --git a/llvm/test/CodeGen/RISCV/alloca.ll b/llvm/test/CodeGen/RISCV/alloca.ll
index 975fc93c830af..2463cd229ee7d 100644
--- a/llvm/test/CodeGen/RISCV/alloca.ll
+++ b/llvm/test/CodeGen/RISCV/alloca.ll
@@ -76,21 +76,21 @@ define void @alloca_callframe(i32 %n) nounwind {
 ; RV32I-NEXT:    sub a0, sp, a0
 ; RV32I-NEXT:    mv sp, a0
 ; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    li t0, 12
-; RV32I-NEXT:    li t1, 11
-; RV32I-NEXT:    li t2, 10
-; RV32I-NEXT:    li t3, 9
+; RV32I-NEXT:    li a7, 12
+; RV32I-NEXT:    li t0, 11
+; RV32I-NEXT:    li t1, 10
+; RV32I-NEXT:    li t2, 9
 ; RV32I-NEXT:    li a1, 2
 ; RV32I-NEXT:    li a2, 3
 ; RV32I-NEXT:    li a3, 4
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 6
 ; RV32I-NEXT:    li a6, 7
+; RV32I-NEXT:    sw t2, 0(sp)
+; RV32I-NEXT:    sw t1, 4(sp)
+; RV32I-NEXT:    sw t0, 8(sp)
+; RV32I-NEXT:    sw a7, 12(sp)
 ; RV32I-NEXT:    li a7, 8
-; RV32I-NEXT:    sw t3, 0(sp)
-; RV32I-NEXT:    sw t2, 4(sp)
-; RV32I-NEXT:    sw t1, 8(sp)
-; RV32I-NEXT:    sw t0, 12(sp)
 ; RV32I-NEXT:    call func
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    addi sp, s0, -16
diff --git a/llvm/test/CodeGen/RISCV/alu64.ll b/llvm/test/CodeGen/RISCV/alu64.ll
index f032756e007b6..8d393e894e69d 100644
--- a/llvm/test/CodeGen/RISCV/alu64.ll
+++ b/llvm/test/CodeGen/RISCV/alu64.ll
@@ -206,8 +206,8 @@ define i64 @sll(i64 %a, i64 %b) nounwind {
 ;
 ; RV32I-LABEL: sll:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a4, a2, -32
 ; RV32I-NEXT:    sll a3, a0, a2
+; RV32I-NEXT:    addi a4, a2, -32
 ; RV32I-NEXT:    bltz a4, .LBB11_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a1, a3
@@ -293,8 +293,8 @@ define i64 @srl(i64 %a, i64 %b) nounwind {
 ;
 ; RV32I-LABEL: srl:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a4, a2, -32
 ; RV32I-NEXT:    srl a3, a1, a2
+; RV32I-NEXT:    addi a4, a2, -32
 ; RV32I-NEXT:    bltz a4, .LBB15_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a3
@@ -322,13 +322,12 @@ define i64 @sra(i64 %a, i64 %b) nounwind {
 ; RV32I-LABEL: sra:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    mv a3, a1
-; RV32I-NEXT:    addi a4, a2, -32
 ; RV32I-NEXT:    sra a1, a1, a2
+; RV32I-NEXT:    addi a4, a2, -32
 ; RV32I-NEXT:    bltz a4, .LBB16_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    srai a3, a3, 31
 ; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    srai a1, a3, 31
 ; RV32I-NEXT:    ret
 ; RV32I-NEXT:  .LBB16_2:
 ; RV32I-NEXT:    srl a0, a0, a2
diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll b/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll
index 8534ad379ebab..4abc125ce58eb 100644
--- a/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll
@@ -192,41 +192,41 @@ define void @amomax_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a2
-; RV32-NEXT:    mv s1, a0
-; RV32-NEXT:    lw a4, 0(a0)
-; RV32-NEXT:    lw a5, 4(a0)
-; RV32-NEXT:    mv s2, a1
+; RV32-NEXT:    mv s1, a1
+; RV32-NEXT:    mv s2, a0
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a4, 4(a0)
 ; RV32-NEXT:    j .LBB11_2
 ; RV32-NEXT:  .LBB11_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB11_2 Depth=1
-; RV32-NEXT:    sw a4, 8(sp)
-; RV32-NEXT:    sw a5, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a4, 12(sp)
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
-; RV32-NEXT:    mv a0, s1
+; RV32-NEXT:    mv a0, s2
 ; RV32-NEXT:    call __atomic_compare_exchange_8
-; RV32-NEXT:    lw a4, 8(sp)
-; RV32-NEXT:    lw a5, 12(sp)
+; RV32-NEXT:    lw a1, 8(sp)
+; RV32-NEXT:    lw a4, 12(sp)
 ; RV32-NEXT:    bnez a0, .LBB11_6
 ; RV32-NEXT:  .LBB11_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    beq a5, s0, .LBB11_4
+; RV32-NEXT:    beq a4, s0, .LBB11_4
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB11_2 Depth=1
-; RV32-NEXT:    slt a0, s0, a5
-; RV32-NEXT:    mv a2, a4
-; RV32-NEXT:    mv a3, a5
+; RV32-NEXT:    slt a0, s0, a4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    mv a3, a4
 ; RV32-NEXT:    bnez a0, .LBB11_1
 ; RV32-NEXT:    j .LBB11_5
 ; RV32-NEXT:  .LBB11_4: # in Loop: Header=BB11_2 Depth=1
-; RV32-NEXT:    sltu a0, s2, a4
-; RV32-NEXT:    mv a2, a4
-; RV32-NEXT:    mv a3, a5
+; RV32-NEXT:    sltu a0, s1, a1
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    mv a3, a4
 ; RV32-NEXT:    bnez a0, .LBB11_1
 ; RV32-NEXT:  .LBB11_5: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB11_2 Depth=1
-; RV32-NEXT:    mv a2, s2
+; RV32-NEXT:    mv a2, s1
 ; RV32-NEXT:    mv a3, s0
 ; RV32-NEXT:    j .LBB11_1
 ; RV32-NEXT:  .LBB11_6: # %atomicrmw.end
@@ -268,41 +268,41 @@ define void @amomaxu_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a2
-; RV32-NEXT:    mv s1, a0
-; RV32-NEXT:    lw a4, 0(a0)
-; RV32-NEXT:    lw a5, 4(a0)
-; RV32-NEXT:    mv s2, a1
+; RV32-NEXT:    mv s1, a1
+; RV32-NEXT:    mv s2, a0
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a4, 4(a0)
 ; RV32-NEXT:    j .LBB13_2
 ; RV32-NEXT:  .LBB13_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB13_2 Depth=1
-; RV32-NEXT:    sw a4, 8(sp)
-; RV32-NEXT:    sw a5, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a4, 12(sp)
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
-; RV32-NEXT:    mv a0, s1
+; RV32-NEXT:    mv a0, s2
 ; RV32-NEXT:    call __atomic_compare_exchange_8
-; RV32-NEXT:    lw a4, 8(sp)
-; RV32-NEXT:    lw a5, 12(sp)
+; RV32-NEXT:    lw a1, 8(sp)
+; RV32-NEXT:    lw a4, 12(sp)
 ; RV32-NEXT:    bnez a0, .LBB13_6
 ; RV32-NEXT:  .LBB13_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    beq a5, s0, .LBB13_4
+; RV32-NEXT:    beq a4, s0, .LBB13_4
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB13_2 Depth=1
-; RV32-NEXT:    sltu a0, s0, a5
-; RV32-NEXT:    mv a2, a4
-; RV32-NEXT:    mv a3, a5
+; RV32-NEXT:    sltu a0, s0, a4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    mv a3, a4
 ; RV32-NEXT:    bnez a0, .LBB13_1
 ; RV32-NEXT:    j .LBB13_5
 ; RV32-NEXT:  .LBB13_4: # in Loop: Header=BB13_2 Depth=1
-; RV32-NEXT:    sltu a0, s2, a4
-; RV32-NEXT:    mv a2, a4
-; RV32-NEXT:    mv a3, a5
+; RV32-NEXT:    sltu a0, s1, a1
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    mv a3, a4
 ; RV32-NEXT:    bnez a0, .LBB13_1
 ; RV32-NEXT:  .LBB13_5: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB13_2 Depth=1
-; RV32-NEXT:    mv a2, s2
+; RV32-NEXT:    mv a2, s1
 ; RV32-NEXT:    mv a3, s0
 ; RV32-NEXT:    j .LBB13_1
 ; RV32-NEXT:  .LBB13_6: # %atomicrmw.end
@@ -344,41 +344,41 @@ define void @amomin_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a2
-; RV32-NEXT:    mv s1, a0
-; RV32-NEXT:    lw a4, 0(a0)
-; RV32-NEXT:    lw a5, 4(a0)
-; RV32-NEXT:    mv s2, a1
+; RV32-NEXT:    mv s1, a1
+; RV32-NEXT:    mv s2, a0
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a4, 4(a0)
 ; RV32-NEXT:    j .LBB15_2
 ; RV32-NEXT:  .LBB15_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB15_2 Depth=1
-; RV32-NEXT:    sw a4, 8(sp)
-; RV32-NEXT:    sw a5, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a4, 12(sp)
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
-; RV32-NEXT:    mv a0, s1
+; RV32-NEXT:    mv a0, s2
 ; RV32-NEXT:    call __atomic_compare_exchange_8
-; RV32-NEXT:    lw a4, 8(sp)
-; RV32-NEXT:    lw a5, 12(sp)
+; RV32-NEXT:    lw a1, 8(sp)
+; RV32-NEXT:    lw a4, 12(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_6
 ; RV32-NEXT:  .LBB15_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    beq a5, s0, .LBB15_4
+; RV32-NEXT:    beq a4, s0, .LBB15_4
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB15_2 Depth=1
-; RV32-NEXT:    slt a0, s0, a5
-; RV32-NEXT:    mv a2, a4
-; RV32-NEXT:    mv a3, a5
+; RV32-NEXT:    slt a0, s0, a4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    mv a3, a4
 ; RV32-NEXT:    beqz a0, .LBB15_1
 ; RV32-NEXT:    j .LBB15_5
 ; RV32-NEXT:  .LBB15_4: # in Loop: Header=BB15_2 Depth=1
-; RV32-NEXT:    sltu a0, s2, a4
-; RV32-NEXT:    mv a2, a4
-; RV32-NEXT:    mv a3, a5
+; RV32-NEXT:    sltu a0, s1, a1
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    mv a3, a4
 ; RV32-NEXT:    beqz a0, .LBB15_1
 ; RV32-NEXT:  .LBB15_5: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB15_2 Depth=1
-; RV32-NEXT:    mv a2, s2
+; RV32-NEXT:    mv a2, s1
 ; RV32-NEXT:    mv a3, s0
 ; RV32-NEXT:    j .LBB15_1
 ; RV32-NEXT:  .LBB15_6: # %atomicrmw.end
@@ -420,41 +420,41 @@ define void @amominu_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a2
-; RV32-NEXT:    mv s1, a0
-; RV32-NEXT:    lw a4, 0(a0)
-; RV32-NEXT:    lw a5, 4(a0)
-; RV32-NEXT:    mv s2, a1
+; RV32-NEXT:    mv s1, a1
+; RV32-NEXT:    mv s2, a0
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a4, 4(a0)
 ; RV32-NEXT:    j .LBB17_2
 ; RV32-NEXT:  .LBB17_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB17_2 Depth=1
-; RV32-NEXT:    sw a4, 8(sp)
-; RV32-NEXT:    sw a5, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a4, 12(sp)
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
-; RV32-NEXT:    mv a0, s1
+; RV32-NEXT:    mv a0, s2
 ; RV32-NEXT:    call __atomic_compare_exchange_8
-; RV32-NEXT:    lw a4, 8(sp)
-; RV32-NEXT:    lw a5, 12(sp)
+; RV32-NEXT:    lw a1, 8(sp)
+; RV32-NEXT:    lw a4, 12(sp)
 ; RV32-NEXT:    bnez a0, .LBB17_6
 ; RV32-NEXT:  .LBB17_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    beq a5, s0, .LBB17_4
+; RV32-NEXT:    beq a4, s0, .LBB17_4
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB17_2 Depth=1
-; RV32-NEXT:    sltu a0, s0, a5
-; RV32-NEXT:    mv a2, a4
-; RV32-NEXT:    mv a3, a5
+; RV32-NEXT:    sltu a0, s0, a4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    mv a3, a4
 ; RV32-NEXT:    beqz a0, .LBB17_1
 ; RV32-NEXT:    j .LBB17_5
 ; RV32-NEXT:  .LBB17_4: # in Loop: Header=BB17_2 Depth=1
-; RV32-NEXT:    sltu a0, s2, a4
-; RV32-NEXT:    mv a2, a4
-; RV32-NEXT:    mv a3, a5
+; RV32-NEXT:    sltu a0, s1, a1
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    mv a3, a4
 ; RV32-NEXT:    beqz a0, .LBB17_1
 ; RV32-NEXT:  .LBB17_5: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB17_2 Depth=1
-; RV32-NEXT:    mv a2, s2
+; RV32-NEXT:    mv a2, s1
 ; RV32-NEXT:    mv a3, s0
 ; RV32-NEXT:    j .LBB17_1
 ; RV32-NEXT:  .LBB17_6: # %atomicrmw.end
diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
index 81518541477a8..95cd49ff9611d 100644
--- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
@@ -5352,34 +5352,34 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 24
-; RV32I-NEXT:    srai s2, a0, 24
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    srai s2, a1, 24
 ; RV32I-NEXT:    j .LBB45_2
 ; RV32I-NEXT:  .LBB45_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB45_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB45_4
 ; RV32I-NEXT:  .LBB45_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 24
-; RV32I-NEXT:    srai a0, a0, 24
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    blt s2, a0, .LBB45_1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt s2, a1, .LBB45_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB45_1
 ; RV32I-NEXT:  .LBB45_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -5423,34 +5423,34 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 56
-; RV64I-NEXT:    srai s2, a0, 56
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai s2, a1, 56
 ; RV64I-NEXT:    j .LBB45_2
 ; RV64I-NEXT:  .LBB45_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB45_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB45_4
 ; RV64I-NEXT:  .LBB45_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 56
-; RV64I-NEXT:    srai a0, a0, 56
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s2, a0, .LBB45_1
+; RV64I-NEXT:    slli a1, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s2, a1, .LBB45_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB45_1
 ; RV64I-NEXT:  .LBB45_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -5537,34 +5537,34 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 24
-; RV32I-NEXT:    srai s2, a0, 24
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    srai s2, a1, 24
 ; RV32I-NEXT:    j .LBB46_2
 ; RV32I-NEXT:  .LBB46_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 2
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB46_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB46_4
 ; RV32I-NEXT:  .LBB46_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 24
-; RV32I-NEXT:    srai a0, a0, 24
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    blt s2, a0, .LBB46_1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt s2, a1, .LBB46_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB46_1
 ; RV32I-NEXT:  .LBB46_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -5637,34 +5637,34 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 56
-; RV64I-NEXT:    srai s2, a0, 56
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai s2, a1, 56
 ; RV64I-NEXT:    j .LBB46_2
 ; RV64I-NEXT:  .LBB46_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB46_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB46_4
 ; RV64I-NEXT:  .LBB46_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 56
-; RV64I-NEXT:    srai a0, a0, 56
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s2, a0, .LBB46_1
+; RV64I-NEXT:    slli a1, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s2, a1, .LBB46_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB46_1
 ; RV64I-NEXT:  .LBB46_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -5809,34 +5809,34 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 24
-; RV32I-NEXT:    srai s2, a0, 24
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    srai s2, a1, 24
 ; RV32I-NEXT:    j .LBB47_2
 ; RV32I-NEXT:  .LBB47_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB47_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 3
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB47_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB47_4
 ; RV32I-NEXT:  .LBB47_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 24
-; RV32I-NEXT:    srai a0, a0, 24
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    blt s2, a0, .LBB47_1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt s2, a1, .LBB47_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB47_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB47_1
 ; RV32I-NEXT:  .LBB47_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -5909,34 +5909,34 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 56
-; RV64I-NEXT:    srai s2, a0, 56
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai s2, a1, 56
 ; RV64I-NEXT:    j .LBB47_2
 ; RV64I-NEXT:  .LBB47_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB47_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 3
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB47_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB47_4
 ; RV64I-NEXT:  .LBB47_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 56
-; RV64I-NEXT:    srai a0, a0, 56
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s2, a0, .LBB47_1
+; RV64I-NEXT:    slli a1, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s2, a1, .LBB47_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB47_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB47_1
 ; RV64I-NEXT:  .LBB47_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -6081,34 +6081,34 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 24
-; RV32I-NEXT:    srai s2, a0, 24
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    srai s2, a1, 24
 ; RV32I-NEXT:    j .LBB48_2
 ; RV32I-NEXT:  .LBB48_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB48_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 4
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB48_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB48_4
 ; RV32I-NEXT:  .LBB48_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 24
-; RV32I-NEXT:    srai a0, a0, 24
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    blt s2, a0, .LBB48_1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt s2, a1, .LBB48_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB48_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB48_1
 ; RV32I-NEXT:  .LBB48_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -6181,34 +6181,34 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 56
-; RV64I-NEXT:    srai s2, a0, 56
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai s2, a1, 56
 ; RV64I-NEXT:    j .LBB48_2
 ; RV64I-NEXT:  .LBB48_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB48_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB48_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB48_4
 ; RV64I-NEXT:  .LBB48_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 56
-; RV64I-NEXT:    srai a0, a0, 56
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s2, a0, .LBB48_1
+; RV64I-NEXT:    slli a1, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s2, a1, .LBB48_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB48_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB48_1
 ; RV64I-NEXT:  .LBB48_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -6353,34 +6353,34 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 24
-; RV32I-NEXT:    srai s2, a0, 24
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    srai s2, a1, 24
 ; RV32I-NEXT:    j .LBB49_2
 ; RV32I-NEXT:  .LBB49_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB49_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB49_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB49_4
 ; RV32I-NEXT:  .LBB49_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 24
-; RV32I-NEXT:    srai a0, a0, 24
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    blt s2, a0, .LBB49_1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt s2, a1, .LBB49_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB49_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB49_1
 ; RV32I-NEXT:  .LBB49_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -6424,34 +6424,34 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 56
-; RV64I-NEXT:    srai s2, a0, 56
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai s2, a1, 56
 ; RV64I-NEXT:    j .LBB49_2
 ; RV64I-NEXT:  .LBB49_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB49_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB49_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB49_4
 ; RV64I-NEXT:  .LBB49_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 56
-; RV64I-NEXT:    srai a0, a0, 56
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s2, a0, .LBB49_1
+; RV64I-NEXT:    slli a1, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s2, a1, .LBB49_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB49_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB49_1
 ; RV64I-NEXT:  .LBB49_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -6538,34 +6538,34 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 24
-; RV32I-NEXT:    srai s2, a0, 24
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    srai s2, a1, 24
 ; RV32I-NEXT:    j .LBB50_2
 ; RV32I-NEXT:  .LBB50_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB50_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB50_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB50_4
 ; RV32I-NEXT:  .LBB50_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 24
-; RV32I-NEXT:    srai a0, a0, 24
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bge s2, a0, .LBB50_1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bge s2, a1, .LBB50_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB50_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB50_1
 ; RV32I-NEXT:  .LBB50_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -6609,34 +6609,34 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 56
-; RV64I-NEXT:    srai s2, a0, 56
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai s2, a1, 56
 ; RV64I-NEXT:    j .LBB50_2
 ; RV64I-NEXT:  .LBB50_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB50_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB50_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB50_4
 ; RV64I-NEXT:  .LBB50_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 56
-; RV64I-NEXT:    srai a0, a0, 56
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s2, a0, .LBB50_1
+; RV64I-NEXT:    slli a1, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s2, a1, .LBB50_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB50_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB50_1
 ; RV64I-NEXT:  .LBB50_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -6723,34 +6723,34 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 24
-; RV32I-NEXT:    srai s2, a0, 24
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    srai s2, a1, 24
 ; RV32I-NEXT:    j .LBB51_2
 ; RV32I-NEXT:  .LBB51_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB51_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 2
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB51_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB51_4
 ; RV32I-NEXT:  .LBB51_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 24
-; RV32I-NEXT:    srai a0, a0, 24
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bge s2, a0, .LBB51_1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bge s2, a1, .LBB51_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB51_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB51_1
 ; RV32I-NEXT:  .LBB51_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -6823,34 +6823,34 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 56
-; RV64I-NEXT:    srai s2, a0, 56
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai s2, a1, 56
 ; RV64I-NEXT:    j .LBB51_2
 ; RV64I-NEXT:  .LBB51_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB51_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB51_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB51_4
 ; RV64I-NEXT:  .LBB51_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 56
-; RV64I-NEXT:    srai a0, a0, 56
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s2, a0, .LBB51_1
+; RV64I-NEXT:    slli a1, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s2, a1, .LBB51_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB51_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB51_1
 ; RV64I-NEXT:  .LBB51_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -6995,34 +6995,34 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 24
-; RV32I-NEXT:    srai s2, a0, 24
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    srai s2, a1, 24
 ; RV32I-NEXT:    j .LBB52_2
 ; RV32I-NEXT:  .LBB52_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB52_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 3
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB52_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB52_4
 ; RV32I-NEXT:  .LBB52_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 24
-; RV32I-NEXT:    srai a0, a0, 24
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bge s2, a0, .LBB52_1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bge s2, a1, .LBB52_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB52_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB52_1
 ; RV32I-NEXT:  .LBB52_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -7095,34 +7095,34 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 56
-; RV64I-NEXT:    srai s2, a0, 56
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai s2, a1, 56
 ; RV64I-NEXT:    j .LBB52_2
 ; RV64I-NEXT:  .LBB52_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB52_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 3
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB52_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB52_4
 ; RV64I-NEXT:  .LBB52_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 56
-; RV64I-NEXT:    srai a0, a0, 56
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s2, a0, .LBB52_1
+; RV64I-NEXT:    slli a1, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s2, a1, .LBB52_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB52_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB52_1
 ; RV64I-NEXT:  .LBB52_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -7267,34 +7267,34 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 24
-; RV32I-NEXT:    srai s2, a0, 24
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    srai s2, a1, 24
 ; RV32I-NEXT:    j .LBB53_2
 ; RV32I-NEXT:  .LBB53_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB53_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 4
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB53_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB53_4
 ; RV32I-NEXT:  .LBB53_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 24
-; RV32I-NEXT:    srai a0, a0, 24
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bge s2, a0, .LBB53_1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bge s2, a1, .LBB53_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB53_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB53_1
 ; RV32I-NEXT:  .LBB53_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -7367,34 +7367,34 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 56
-; RV64I-NEXT:    srai s2, a0, 56
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai s2, a1, 56
 ; RV64I-NEXT:    j .LBB53_2
 ; RV64I-NEXT:  .LBB53_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB53_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB53_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB53_4
 ; RV64I-NEXT:  .LBB53_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 56
-; RV64I-NEXT:    srai a0, a0, 56
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s2, a0, .LBB53_1
+; RV64I-NEXT:    slli a1, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s2, a1, .LBB53_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB53_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB53_1
 ; RV64I-NEXT:  .LBB53_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -7539,34 +7539,34 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 24
-; RV32I-NEXT:    srai s2, a0, 24
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    srai s2, a1, 24
 ; RV32I-NEXT:    j .LBB54_2
 ; RV32I-NEXT:  .LBB54_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB54_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB54_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB54_4
 ; RV32I-NEXT:  .LBB54_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 24
-; RV32I-NEXT:    srai a0, a0, 24
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bge s2, a0, .LBB54_1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bge s2, a1, .LBB54_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB54_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB54_1
 ; RV32I-NEXT:  .LBB54_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -7610,34 +7610,34 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 56
-; RV64I-NEXT:    srai s2, a0, 56
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai s2, a1, 56
 ; RV64I-NEXT:    j .LBB54_2
 ; RV64I-NEXT:  .LBB54_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB54_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB54_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB54_4
 ; RV64I-NEXT:  .LBB54_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 56
-; RV64I-NEXT:    srai a0, a0, 56
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s2, a0, .LBB54_1
+; RV64I-NEXT:    slli a1, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s2, a1, .LBB54_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB54_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB54_1
 ; RV64I-NEXT:  .LBB54_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -7724,32 +7724,32 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    andi s2, a1, 255
 ; RV32I-NEXT:    j .LBB55_2
 ; RV32I-NEXT:  .LBB55_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB55_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB55_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB55_4
 ; RV32I-NEXT:  .LBB55_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    andi a0, a3, 255
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bltu s2, a0, .LBB55_1
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu s2, a1, .LBB55_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB55_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB55_1
 ; RV32I-NEXT:  .LBB55_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -7788,32 +7788,32 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    andi s2, a1, 255
 ; RV64I-NEXT:    j .LBB55_2
 ; RV64I-NEXT:  .LBB55_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB55_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB55_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB55_4
 ; RV64I-NEXT:  .LBB55_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    andi a0, a3, 255
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s2, a0, .LBB55_1
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s2, a1, .LBB55_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB55_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB55_1
 ; RV64I-NEXT:  .LBB55_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -7890,32 +7890,32 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    andi s2, a1, 255
 ; RV32I-NEXT:    j .LBB56_2
 ; RV32I-NEXT:  .LBB56_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB56_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 2
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB56_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB56_4
 ; RV32I-NEXT:  .LBB56_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    andi a0, a3, 255
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bltu s2, a0, .LBB56_1
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu s2, a1, .LBB56_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB56_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB56_1
 ; RV32I-NEXT:  .LBB56_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -7978,32 +7978,32 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    andi s2, a1, 255
 ; RV64I-NEXT:    j .LBB56_2
 ; RV64I-NEXT:  .LBB56_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB56_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB56_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB56_4
 ; RV64I-NEXT:  .LBB56_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    andi a0, a3, 255
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s2, a0, .LBB56_1
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s2, a1, .LBB56_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB56_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB56_1
 ; RV64I-NEXT:  .LBB56_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -8128,32 +8128,32 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    andi s2, a1, 255
 ; RV32I-NEXT:    j .LBB57_2
 ; RV32I-NEXT:  .LBB57_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB57_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 3
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB57_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB57_4
 ; RV32I-NEXT:  .LBB57_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    andi a0, a3, 255
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bltu s2, a0, .LBB57_1
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu s2, a1, .LBB57_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB57_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB57_1
 ; RV32I-NEXT:  .LBB57_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -8216,32 +8216,32 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    andi s2, a1, 255
 ; RV64I-NEXT:    j .LBB57_2
 ; RV64I-NEXT:  .LBB57_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB57_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 3
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB57_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB57_4
 ; RV64I-NEXT:  .LBB57_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    andi a0, a3, 255
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s2, a0, .LBB57_1
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s2, a1, .LBB57_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB57_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB57_1
 ; RV64I-NEXT:  .LBB57_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -8366,32 +8366,32 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    andi s2, a1, 255
 ; RV32I-NEXT:    j .LBB58_2
 ; RV32I-NEXT:  .LBB58_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB58_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 4
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB58_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB58_4
 ; RV32I-NEXT:  .LBB58_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    andi a0, a3, 255
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bltu s2, a0, .LBB58_1
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu s2, a1, .LBB58_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB58_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB58_1
 ; RV32I-NEXT:  .LBB58_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -8454,32 +8454,32 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    andi s2, a1, 255
 ; RV64I-NEXT:    j .LBB58_2
 ; RV64I-NEXT:  .LBB58_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB58_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB58_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB58_4
 ; RV64I-NEXT:  .LBB58_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    andi a0, a3, 255
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s2, a0, .LBB58_1
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s2, a1, .LBB58_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB58_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB58_1
 ; RV64I-NEXT:  .LBB58_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -8604,32 +8604,32 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    andi s2, a1, 255
 ; RV32I-NEXT:    j .LBB59_2
 ; RV32I-NEXT:  .LBB59_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB59_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB59_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB59_4
 ; RV32I-NEXT:  .LBB59_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    andi a0, a3, 255
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bltu s2, a0, .LBB59_1
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu s2, a1, .LBB59_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB59_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB59_1
 ; RV32I-NEXT:  .LBB59_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -8668,32 +8668,32 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    andi s2, a1, 255
 ; RV64I-NEXT:    j .LBB59_2
 ; RV64I-NEXT:  .LBB59_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB59_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB59_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB59_4
 ; RV64I-NEXT:  .LBB59_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    andi a0, a3, 255
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s2, a0, .LBB59_1
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s2, a1, .LBB59_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB59_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB59_1
 ; RV64I-NEXT:  .LBB59_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -8770,32 +8770,32 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    andi s2, a1, 255
 ; RV32I-NEXT:    j .LBB60_2
 ; RV32I-NEXT:  .LBB60_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB60_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB60_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB60_4
 ; RV32I-NEXT:  .LBB60_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    andi a0, a3, 255
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bgeu s2, a0, .LBB60_1
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgeu s2, a1, .LBB60_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB60_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB60_1
 ; RV32I-NEXT:  .LBB60_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -8834,32 +8834,32 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    andi s2, a1, 255
 ; RV64I-NEXT:    j .LBB60_2
 ; RV64I-NEXT:  .LBB60_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB60_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB60_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB60_4
 ; RV64I-NEXT:  .LBB60_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    andi a0, a3, 255
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s2, a0, .LBB60_1
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s2, a1, .LBB60_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB60_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB60_1
 ; RV64I-NEXT:  .LBB60_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -8936,32 +8936,32 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    andi s2, a1, 255
 ; RV32I-NEXT:    j .LBB61_2
 ; RV32I-NEXT:  .LBB61_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB61_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 2
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB61_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB61_4
 ; RV32I-NEXT:  .LBB61_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    andi a0, a3, 255
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bgeu s2, a0, .LBB61_1
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgeu s2, a1, .LBB61_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB61_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB61_1
 ; RV32I-NEXT:  .LBB61_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -9024,32 +9024,32 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    andi s2, a1, 255
 ; RV64I-NEXT:    j .LBB61_2
 ; RV64I-NEXT:  .LBB61_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB61_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB61_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB61_4
 ; RV64I-NEXT:  .LBB61_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    andi a0, a3, 255
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s2, a0, .LBB61_1
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s2, a1, .LBB61_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB61_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB61_1
 ; RV64I-NEXT:  .LBB61_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -9174,32 +9174,32 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    andi s2, a1, 255
 ; RV32I-NEXT:    j .LBB62_2
 ; RV32I-NEXT:  .LBB62_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB62_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 3
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB62_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB62_4
 ; RV32I-NEXT:  .LBB62_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    andi a0, a3, 255
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bgeu s2, a0, .LBB62_1
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgeu s2, a1, .LBB62_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB62_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB62_1
 ; RV32I-NEXT:  .LBB62_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -9262,32 +9262,32 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    andi s2, a1, 255
 ; RV64I-NEXT:    j .LBB62_2
 ; RV64I-NEXT:  .LBB62_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB62_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 3
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB62_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB62_4
 ; RV64I-NEXT:  .LBB62_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    andi a0, a3, 255
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s2, a0, .LBB62_1
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s2, a1, .LBB62_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB62_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB62_1
 ; RV64I-NEXT:  .LBB62_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -9412,32 +9412,32 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    andi s2, a1, 255
 ; RV32I-NEXT:    j .LBB63_2
 ; RV32I-NEXT:  .LBB63_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB63_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 4
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB63_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB63_4
 ; RV32I-NEXT:  .LBB63_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    andi a0, a3, 255
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bgeu s2, a0, .LBB63_1
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgeu s2, a1, .LBB63_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB63_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB63_1
 ; RV32I-NEXT:  .LBB63_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -9500,32 +9500,32 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    andi s2, a1, 255
 ; RV64I-NEXT:    j .LBB63_2
 ; RV64I-NEXT:  .LBB63_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB63_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB63_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB63_4
 ; RV64I-NEXT:  .LBB63_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    andi a0, a3, 255
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s2, a0, .LBB63_1
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s2, a1, .LBB63_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB63_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB63_1
 ; RV64I-NEXT:  .LBB63_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -9650,32 +9650,32 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    andi s2, a1, 255
 ; RV32I-NEXT:    j .LBB64_2
 ; RV32I-NEXT:  .LBB64_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB64_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB64_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB64_4
 ; RV32I-NEXT:  .LBB64_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    andi a0, a3, 255
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bgeu s2, a0, .LBB64_1
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgeu s2, a1, .LBB64_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB64_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB64_1
 ; RV32I-NEXT:  .LBB64_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -9714,32 +9714,32 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    andi s2, a1, 255
 ; RV64I-NEXT:    j .LBB64_2
 ; RV64I-NEXT:  .LBB64_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB64_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB64_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB64_4
 ; RV64I-NEXT:  .LBB64_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    andi a0, a3, 255
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s2, a0, .LBB64_1
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s2, a1, .LBB64_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB64_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB64_1
 ; RV64I-NEXT:  .LBB64_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -15381,34 +15381,34 @@ define i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lhu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 16
-; RV32I-NEXT:    srai s2, a0, 16
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lhu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    srai s2, a1, 16
 ; RV32I-NEXT:    j .LBB110_2
 ; RV32I-NEXT:  .LBB110_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB110_2 Depth=1
-; RV32I-NEXT:    sh a3, 14(sp)
+; RV32I-NEXT:    sh a0, 14(sp)
 ; RV32I-NEXT:    addi a1, sp, 14
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a3, 14(sp)
-; RV32I-NEXT:    bnez a0, .LBB110_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 14(sp)
+; RV32I-NEXT:    bnez a1, .LBB110_4
 ; RV32I-NEXT:  .LBB110_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 16
-; RV32I-NEXT:    srai a0, a0, 16
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    blt s2, a0, .LBB110_1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt s2, a1, .LBB110_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB110_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB110_1
 ; RV32I-NEXT:  .LBB110_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -15454,34 +15454,34 @@ define i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lhu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 48
-; RV64I-NEXT:    srai s2, a0, 48
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lhu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai s2, a1, 48
 ; RV64I-NEXT:    j .LBB110_2
 ; RV64I-NEXT:  .LBB110_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB110_2 Depth=1
-; RV64I-NEXT:    sh a3, 14(sp)
+; RV64I-NEXT:    sh a0, 14(sp)
 ; RV64I-NEXT:    addi a1, sp, 14
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a3, 14(sp)
-; RV64I-NEXT:    bnez a0, .LBB110_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 14(sp)
+; RV64I-NEXT:    bnez a1, .LBB110_4
 ; RV64I-NEXT:  .LBB110_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 48
-; RV64I-NEXT:    srai a0, a0, 48
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s2, a0, .LBB110_1
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s2, a1, .LBB110_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB110_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB110_1
 ; RV64I-NEXT:  .LBB110_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -15572,34 +15572,34 @@ define i16 @atomicrmw_max_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lhu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 16
-; RV32I-NEXT:    srai s2, a0, 16
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lhu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    srai s2, a1, 16
 ; RV32I-NEXT:    j .LBB111_2
 ; RV32I-NEXT:  .LBB111_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB111_2 Depth=1
-; RV32I-NEXT:    sh a3, 14(sp)
+; RV32I-NEXT:    sh a0, 14(sp)
 ; RV32I-NEXT:    addi a1, sp, 14
 ; RV32I-NEXT:    li a3, 2
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a3, 14(sp)
-; RV32I-NEXT:    bnez a0, .LBB111_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 14(sp)
+; RV32I-NEXT:    bnez a1, .LBB111_4
 ; RV32I-NEXT:  .LBB111_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 16
-; RV32I-NEXT:    srai a0, a0, 16
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    blt s2, a0, .LBB111_1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt s2, a1, .LBB111_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB111_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB111_1
 ; RV32I-NEXT:  .LBB111_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -15676,34 +15676,34 @@ define i16 @atomicrmw_max_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lhu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 48
-; RV64I-NEXT:    srai s2, a0, 48
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lhu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai s2, a1, 48
 ; RV64I-NEXT:    j .LBB111_2
 ; RV64I-NEXT:  .LBB111_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB111_2 Depth=1
-; RV64I-NEXT:    sh a3, 14(sp)
+; RV64I-NEXT:    sh a0, 14(sp)
 ; RV64I-NEXT:    addi a1, sp, 14
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a3, 14(sp)
-; RV64I-NEXT:    bnez a0, .LBB111_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 14(sp)
+; RV64I-NEXT:    bnez a1, .LBB111_4
 ; RV64I-NEXT:  .LBB111_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 48
-; RV64I-NEXT:    srai a0, a0, 48
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s2, a0, .LBB111_1
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s2, a1, .LBB111_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB111_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB111_1
 ; RV64I-NEXT:  .LBB111_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -15856,34 +15856,34 @@ define i16 @atomicrmw_max_i16_release(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lhu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 16
-; RV32I-NEXT:    srai s2, a0, 16
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lhu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    srai s2, a1, 16
 ; RV32I-NEXT:    j .LBB112_2
 ; RV32I-NEXT:  .LBB112_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB112_2 Depth=1
-; RV32I-NEXT:    sh a3, 14(sp)
+; RV32I-NEXT:    sh a0, 14(sp)
 ; RV32I-NEXT:    addi a1, sp, 14
 ; RV32I-NEXT:    li a3, 3
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a3, 14(sp)
-; RV32I-NEXT:    bnez a0, .LBB112_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 14(sp)
+; RV32I-NEXT:    bnez a1, .LBB112_4
 ; RV32I-NEXT:  .LBB112_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 16
-; RV32I-NEXT:    srai a0, a0, 16
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    blt s2, a0, .LBB112_1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt s2, a1, .LBB112_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB112_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB112_1
 ; RV32I-NEXT:  .LBB112_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -15960,34 +15960,34 @@ define i16 @atomicrmw_max_i16_release(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lhu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 48
-; RV64I-NEXT:    srai s2, a0, 48
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lhu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai s2, a1, 48
 ; RV64I-NEXT:    j .LBB112_2
 ; RV64I-NEXT:  .LBB112_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB112_2 Depth=1
-; RV64I-NEXT:    sh a3, 14(sp)
+; RV64I-NEXT:    sh a0, 14(sp)
 ; RV64I-NEXT:    addi a1, sp, 14
 ; RV64I-NEXT:    li a3, 3
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a3, 14(sp)
-; RV64I-NEXT:    bnez a0, .LBB112_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 14(sp)
+; RV64I-NEXT:    bnez a1, .LBB112_4
 ; RV64I-NEXT:  .LBB112_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 48
-; RV64I-NEXT:    srai a0, a0, 48
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s2, a0, .LBB112_1
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s2, a1, .LBB112_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB112_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB112_1
 ; RV64I-NEXT:  .LBB112_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -16140,34 +16140,34 @@ define i16 @atomicrmw_max_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lhu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 16
-; RV32I-NEXT:    srai s2, a0, 16
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lhu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    srai s2, a1, 16
 ; RV32I-NEXT:    j .LBB113_2
 ; RV32I-NEXT:  .LBB113_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB113_2 Depth=1
-; RV32I-NEXT:    sh a3, 14(sp)
+; RV32I-NEXT:    sh a0, 14(sp)
 ; RV32I-NEXT:    addi a1, sp, 14
 ; RV32I-NEXT:    li a3, 4
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a3, 14(sp)
-; RV32I-NEXT:    bnez a0, .LBB113_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 14(sp)
+; RV32I-NEXT:    bnez a1, .LBB113_4
 ; RV32I-NEXT:  .LBB113_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 16
-; RV32I-NEXT:    srai a0, a0, 16
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    blt s2, a0, .LBB113_1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt s2, a1, .LBB113_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB113_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB113_1
 ; RV32I-NEXT:  .LBB113_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -16244,34 +16244,34 @@ define i16 @atomicrmw_max_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lhu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 48
-; RV64I-NEXT:    srai s2, a0, 48
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lhu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai s2, a1, 48
 ; RV64I-NEXT:    j .LBB113_2
 ; RV64I-NEXT:  .LBB113_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB113_2 Depth=1
-; RV64I-NEXT:    sh a3, 14(sp)
+; RV64I-NEXT:    sh a0, 14(sp)
 ; RV64I-NEXT:    addi a1, sp, 14
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a3, 14(sp)
-; RV64I-NEXT:    bnez a0, .LBB113_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 14(sp)
+; RV64I-NEXT:    bnez a1, .LBB113_4
 ; RV64I-NEXT:  .LBB113_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 48
-; RV64I-NEXT:    srai a0, a0, 48
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s2, a0, .LBB113_1
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s2, a1, .LBB113_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB113_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB113_1
 ; RV64I-NEXT:  .LBB113_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -16424,34 +16424,34 @@ define i16 @atomicrmw_max_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lhu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 16
-; RV32I-NEXT:    srai s2, a0, 16
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lhu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    srai s2, a1, 16
 ; RV32I-NEXT:    j .LBB114_2
 ; RV32I-NEXT:  .LBB114_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB114_2 Depth=1
-; RV32I-NEXT:    sh a3, 14(sp)
+; RV32I-NEXT:    sh a0, 14(sp)
 ; RV32I-NEXT:    addi a1, sp, 14
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a3, 14(sp)
-; RV32I-NEXT:    bnez a0, .LBB114_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 14(sp)
+; RV32I-NEXT:    bnez a1, .LBB114_4
 ; RV32I-NEXT:  .LBB114_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 16
-; RV32I-NEXT:    srai a0, a0, 16
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    blt s2, a0, .LBB114_1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt s2, a1, .LBB114_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB114_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB114_1
 ; RV32I-NEXT:  .LBB114_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -16497,34 +16497,34 @@ define i16 @atomicrmw_max_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lhu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 48
-; RV64I-NEXT:    srai s2, a0, 48
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lhu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai s2, a1, 48
 ; RV64I-NEXT:    j .LBB114_2
 ; RV64I-NEXT:  .LBB114_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB114_2 Depth=1
-; RV64I-NEXT:    sh a3, 14(sp)
+; RV64I-NEXT:    sh a0, 14(sp)
 ; RV64I-NEXT:    addi a1, sp, 14
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a3, 14(sp)
-; RV64I-NEXT:    bnez a0, .LBB114_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 14(sp)
+; RV64I-NEXT:    bnez a1, .LBB114_4
 ; RV64I-NEXT:  .LBB114_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 48
-; RV64I-NEXT:    srai a0, a0, 48
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s2, a0, .LBB114_1
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s2, a1, .LBB114_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB114_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB114_1
 ; RV64I-NEXT:  .LBB114_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -16615,34 +16615,34 @@ define i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lhu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 16
-; RV32I-NEXT:    srai s2, a0, 16
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lhu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    srai s2, a1, 16
 ; RV32I-NEXT:    j .LBB115_2
 ; RV32I-NEXT:  .LBB115_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB115_2 Depth=1
-; RV32I-NEXT:    sh a3, 14(sp)
+; RV32I-NEXT:    sh a0, 14(sp)
 ; RV32I-NEXT:    addi a1, sp, 14
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a3, 14(sp)
-; RV32I-NEXT:    bnez a0, .LBB115_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 14(sp)
+; RV32I-NEXT:    bnez a1, .LBB115_4
 ; RV32I-NEXT:  .LBB115_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 16
-; RV32I-NEXT:    srai a0, a0, 16
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bge s2, a0, .LBB115_1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bge s2, a1, .LBB115_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB115_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB115_1
 ; RV32I-NEXT:  .LBB115_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -16688,34 +16688,34 @@ define i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lhu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 48
-; RV64I-NEXT:    srai s2, a0, 48
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lhu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai s2, a1, 48
 ; RV64I-NEXT:    j .LBB115_2
 ; RV64I-NEXT:  .LBB115_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB115_2 Depth=1
-; RV64I-NEXT:    sh a3, 14(sp)
+; RV64I-NEXT:    sh a0, 14(sp)
 ; RV64I-NEXT:    addi a1, sp, 14
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a3, 14(sp)
-; RV64I-NEXT:    bnez a0, .LBB115_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 14(sp)
+; RV64I-NEXT:    bnez a1, .LBB115_4
 ; RV64I-NEXT:  .LBB115_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 48
-; RV64I-NEXT:    srai a0, a0, 48
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s2, a0, .LBB115_1
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s2, a1, .LBB115_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB115_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB115_1
 ; RV64I-NEXT:  .LBB115_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -16806,34 +16806,34 @@ define i16 @atomicrmw_min_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lhu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 16
-; RV32I-NEXT:    srai s2, a0, 16
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lhu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    srai s2, a1, 16
 ; RV32I-NEXT:    j .LBB116_2
 ; RV32I-NEXT:  .LBB116_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB116_2 Depth=1
-; RV32I-NEXT:    sh a3, 14(sp)
+; RV32I-NEXT:    sh a0, 14(sp)
 ; RV32I-NEXT:    addi a1, sp, 14
 ; RV32I-NEXT:    li a3, 2
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a3, 14(sp)
-; RV32I-NEXT:    bnez a0, .LBB116_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 14(sp)
+; RV32I-NEXT:    bnez a1, .LBB116_4
 ; RV32I-NEXT:  .LBB116_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 16
-; RV32I-NEXT:    srai a0, a0, 16
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bge s2, a0, .LBB116_1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bge s2, a1, .LBB116_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB116_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB116_1
 ; RV32I-NEXT:  .LBB116_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -16910,34 +16910,34 @@ define i16 @atomicrmw_min_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lhu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 48
-; RV64I-NEXT:    srai s2, a0, 48
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lhu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai s2, a1, 48
 ; RV64I-NEXT:    j .LBB116_2
 ; RV64I-NEXT:  .LBB116_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB116_2 Depth=1
-; RV64I-NEXT:    sh a3, 14(sp)
+; RV64I-NEXT:    sh a0, 14(sp)
 ; RV64I-NEXT:    addi a1, sp, 14
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a3, 14(sp)
-; RV64I-NEXT:    bnez a0, .LBB116_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 14(sp)
+; RV64I-NEXT:    bnez a1, .LBB116_4
 ; RV64I-NEXT:  .LBB116_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 48
-; RV64I-NEXT:    srai a0, a0, 48
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s2, a0, .LBB116_1
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s2, a1, .LBB116_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB116_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB116_1
 ; RV64I-NEXT:  .LBB116_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -17090,34 +17090,34 @@ define i16 @atomicrmw_min_i16_release(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lhu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 16
-; RV32I-NEXT:    srai s2, a0, 16
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lhu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    srai s2, a1, 16
 ; RV32I-NEXT:    j .LBB117_2
 ; RV32I-NEXT:  .LBB117_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB117_2 Depth=1
-; RV32I-NEXT:    sh a3, 14(sp)
+; RV32I-NEXT:    sh a0, 14(sp)
 ; RV32I-NEXT:    addi a1, sp, 14
 ; RV32I-NEXT:    li a3, 3
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a3, 14(sp)
-; RV32I-NEXT:    bnez a0, .LBB117_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 14(sp)
+; RV32I-NEXT:    bnez a1, .LBB117_4
 ; RV32I-NEXT:  .LBB117_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 16
-; RV32I-NEXT:    srai a0, a0, 16
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bge s2, a0, .LBB117_1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bge s2, a1, .LBB117_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB117_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB117_1
 ; RV32I-NEXT:  .LBB117_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -17194,34 +17194,34 @@ define i16 @atomicrmw_min_i16_release(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lhu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 48
-; RV64I-NEXT:    srai s2, a0, 48
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lhu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai s2, a1, 48
 ; RV64I-NEXT:    j .LBB117_2
 ; RV64I-NEXT:  .LBB117_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB117_2 Depth=1
-; RV64I-NEXT:    sh a3, 14(sp)
+; RV64I-NEXT:    sh a0, 14(sp)
 ; RV64I-NEXT:    addi a1, sp, 14
 ; RV64I-NEXT:    li a3, 3
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a3, 14(sp)
-; RV64I-NEXT:    bnez a0, .LBB117_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 14(sp)
+; RV64I-NEXT:    bnez a1, .LBB117_4
 ; RV64I-NEXT:  .LBB117_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 48
-; RV64I-NEXT:    srai a0, a0, 48
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s2, a0, .LBB117_1
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s2, a1, .LBB117_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB117_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB117_1
 ; RV64I-NEXT:  .LBB117_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -17374,34 +17374,34 @@ define i16 @atomicrmw_min_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lhu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 16
-; RV32I-NEXT:    srai s2, a0, 16
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lhu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    srai s2, a1, 16
 ; RV32I-NEXT:    j .LBB118_2
 ; RV32I-NEXT:  .LBB118_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB118_2 Depth=1
-; RV32I-NEXT:    sh a3, 14(sp)
+; RV32I-NEXT:    sh a0, 14(sp)
 ; RV32I-NEXT:    addi a1, sp, 14
 ; RV32I-NEXT:    li a3, 4
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a3, 14(sp)
-; RV32I-NEXT:    bnez a0, .LBB118_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 14(sp)
+; RV32I-NEXT:    bnez a1, .LBB118_4
 ; RV32I-NEXT:  .LBB118_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 16
-; RV32I-NEXT:    srai a0, a0, 16
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bge s2, a0, .LBB118_1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bge s2, a1, .LBB118_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB118_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB118_1
 ; RV32I-NEXT:  .LBB118_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -17478,34 +17478,34 @@ define i16 @atomicrmw_min_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lhu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 48
-; RV64I-NEXT:    srai s2, a0, 48
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lhu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai s2, a1, 48
 ; RV64I-NEXT:    j .LBB118_2
 ; RV64I-NEXT:  .LBB118_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB118_2 Depth=1
-; RV64I-NEXT:    sh a3, 14(sp)
+; RV64I-NEXT:    sh a0, 14(sp)
 ; RV64I-NEXT:    addi a1, sp, 14
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a3, 14(sp)
-; RV64I-NEXT:    bnez a0, .LBB118_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 14(sp)
+; RV64I-NEXT:    bnez a1, .LBB118_4
 ; RV64I-NEXT:  .LBB118_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 48
-; RV64I-NEXT:    srai a0, a0, 48
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s2, a0, .LBB118_1
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s2, a1, .LBB118_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB118_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB118_1
 ; RV64I-NEXT:  .LBB118_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -17658,34 +17658,34 @@ define i16 @atomicrmw_min_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lhu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 16
-; RV32I-NEXT:    srai s2, a0, 16
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lhu a0, 0(a0)
+; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    srai s2, a1, 16
 ; RV32I-NEXT:    j .LBB119_2
 ; RV32I-NEXT:  .LBB119_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB119_2 Depth=1
-; RV32I-NEXT:    sh a3, 14(sp)
+; RV32I-NEXT:    sh a0, 14(sp)
 ; RV32I-NEXT:    addi a1, sp, 14
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a3, 14(sp)
-; RV32I-NEXT:    bnez a0, .LBB119_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 14(sp)
+; RV32I-NEXT:    bnez a1, .LBB119_4
 ; RV32I-NEXT:  .LBB119_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 16
-; RV32I-NEXT:    srai a0, a0, 16
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bge s2, a0, .LBB119_1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bge s2, a1, .LBB119_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB119_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB119_1
 ; RV32I-NEXT:  .LBB119_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -17731,34 +17731,34 @@ define i16 @atomicrmw_min_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lhu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 48
-; RV64I-NEXT:    srai s2, a0, 48
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lhu a0, 0(a0)
+; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai s2, a1, 48
 ; RV64I-NEXT:    j .LBB119_2
 ; RV64I-NEXT:  .LBB119_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB119_2 Depth=1
-; RV64I-NEXT:    sh a3, 14(sp)
+; RV64I-NEXT:    sh a0, 14(sp)
 ; RV64I-NEXT:    addi a1, sp, 14
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a3, 14(sp)
-; RV64I-NEXT:    bnez a0, .LBB119_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 14(sp)
+; RV64I-NEXT:    bnez a1, .LBB119_4
 ; RV64I-NEXT:  .LBB119_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 48
-; RV64I-NEXT:    srai a0, a0, 48
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s2, a0, .LBB119_1
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s2, a1, .LBB119_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB119_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB119_1
 ; RV64I-NEXT:  .LBB119_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -17852,32 +17852,32 @@ define i16 @atomicrmw_umax_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a1
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lhu a1, 0(a0)
+; RV32I-NEXT:    lhu a0, 0(a0)
 ; RV32I-NEXT:    lui s2, 16
 ; RV32I-NEXT:    addi s2, s2, -1
-; RV32I-NEXT:    and s3, s0, s2
+; RV32I-NEXT:    and s3, a1, s2
 ; RV32I-NEXT:    j .LBB120_2
 ; RV32I-NEXT:  .LBB120_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB120_2 Depth=1
-; RV32I-NEXT:    sh a1, 10(sp)
+; RV32I-NEXT:    sh a0, 10(sp)
 ; RV32I-NEXT:    addi a1, sp, 10
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a1, 10(sp)
-; RV32I-NEXT:    bnez a0, .LBB120_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 10(sp)
+; RV32I-NEXT:    bnez a1, .LBB120_4
 ; RV32I-NEXT:  .LBB120_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    and a0, a1, s2
-; RV32I-NEXT:    mv a2, a1
-; RV32I-NEXT:    bltu s3, a0, .LBB120_1
+; RV32I-NEXT:    and a1, a0, s2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu s3, a1, .LBB120_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB120_2 Depth=1
 ; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB120_1
 ; RV32I-NEXT:  .LBB120_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -17921,32 +17921,32 @@ define i16 @atomicrmw_umax_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    lhu a0, 0(a0)
 ; RV64I-NEXT:    lui s2, 16
 ; RV64I-NEXT:    addiw s2, s2, -1
-; RV64I-NEXT:    and s3, s0, s2
+; RV64I-NEXT:    and s3, a1, s2
 ; RV64I-NEXT:    j .LBB120_2
 ; RV64I-NEXT:  .LBB120_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB120_2 Depth=1
-; RV64I-NEXT:    sh a1, 6(sp)
+; RV64I-NEXT:    sh a0, 6(sp)
 ; RV64I-NEXT:    addi a1, sp, 6
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a1, 6(sp)
-; RV64I-NEXT:    bnez a0, .LBB120_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 6(sp)
+; RV64I-NEXT:    bnez a1, .LBB120_4
 ; RV64I-NEXT:  .LBB120_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    and a0, a1, s2
-; RV64I-NEXT:    mv a2, a1
-; RV64I-NEXT:    bltu s3, a0, .LBB120_1
+; RV64I-NEXT:    and a1, a0, s2
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s3, a1, .LBB120_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB120_2 Depth=1
 ; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB120_1
 ; RV64I-NEXT:  .LBB120_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -18029,32 +18029,32 @@ define i16 @atomicrmw_umax_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a1
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lhu a1, 0(a0)
+; RV32I-NEXT:    lhu a0, 0(a0)
 ; RV32I-NEXT:    lui s2, 16
 ; RV32I-NEXT:    addi s2, s2, -1
-; RV32I-NEXT:    and s3, s0, s2
+; RV32I-NEXT:    and s3, a1, s2
 ; RV32I-NEXT:    j .LBB121_2
 ; RV32I-NEXT:  .LBB121_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB121_2 Depth=1
-; RV32I-NEXT:    sh a1, 10(sp)
+; RV32I-NEXT:    sh a0, 10(sp)
 ; RV32I-NEXT:    addi a1, sp, 10
 ; RV32I-NEXT:    li a3, 2
 ; RV32I-NEXT:    li a4, 2
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a1, 10(sp)
-; RV32I-NEXT:    bnez a0, .LBB121_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 10(sp)
+; RV32I-NEXT:    bnez a1, .LBB121_4
 ; RV32I-NEXT:  .LBB121_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    and a0, a1, s2
-; RV32I-NEXT:    mv a2, a1
-; RV32I-NEXT:    bltu s3, a0, .LBB121_1
+; RV32I-NEXT:    and a1, a0, s2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu s3, a1, .LBB121_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB121_2 Depth=1
 ; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB121_1
 ; RV32I-NEXT:  .LBB121_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -18123,32 +18123,32 @@ define i16 @atomicrmw_umax_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    lhu a0, 0(a0)
 ; RV64I-NEXT:    lui s2, 16
 ; RV64I-NEXT:    addiw s2, s2, -1
-; RV64I-NEXT:    and s3, s0, s2
+; RV64I-NEXT:    and s3, a1, s2
 ; RV64I-NEXT:    j .LBB121_2
 ; RV64I-NEXT:  .LBB121_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB121_2 Depth=1
-; RV64I-NEXT:    sh a1, 6(sp)
+; RV64I-NEXT:    sh a0, 6(sp)
 ; RV64I-NEXT:    addi a1, sp, 6
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    li a4, 2
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a1, 6(sp)
-; RV64I-NEXT:    bnez a0, .LBB121_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 6(sp)
+; RV64I-NEXT:    bnez a1, .LBB121_4
 ; RV64I-NEXT:  .LBB121_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    and a0, a1, s2
-; RV64I-NEXT:    mv a2, a1
-; RV64I-NEXT:    bltu s3, a0, .LBB121_1
+; RV64I-NEXT:    and a1, a0, s2
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s3, a1, .LBB121_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB121_2 Depth=1
 ; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB121_1
 ; RV64I-NEXT:  .LBB121_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -18281,32 +18281,32 @@ define i16 @atomicrmw_umax_i16_release(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a1
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lhu a1, 0(a0)
+; RV32I-NEXT:    lhu a0, 0(a0)
 ; RV32I-NEXT:    lui s2, 16
 ; RV32I-NEXT:    addi s2, s2, -1
-; RV32I-NEXT:    and s3, s0, s2
+; RV32I-NEXT:    and s3, a1, s2
 ; RV32I-NEXT:    j .LBB122_2
 ; RV32I-NEXT:  .LBB122_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB122_2 Depth=1
-; RV32I-NEXT:    sh a1, 10(sp)
+; RV32I-NEXT:    sh a0, 10(sp)
 ; RV32I-NEXT:    addi a1, sp, 10
 ; RV32I-NEXT:    li a3, 3
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a1, 10(sp)
-; RV32I-NEXT:    bnez a0, .LBB122_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 10(sp)
+; RV32I-NEXT:    bnez a1, .LBB122_4
 ; RV32I-NEXT:  .LBB122_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    and a0, a1, s2
-; RV32I-NEXT:    mv a2, a1
-; RV32I-NEXT:    bltu s3, a0, .LBB122_1
+; RV32I-NEXT:    and a1, a0, s2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu s3, a1, .LBB122_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB122_2 Depth=1
 ; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB122_1
 ; RV32I-NEXT:  .LBB122_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -18375,32 +18375,32 @@ define i16 @atomicrmw_umax_i16_release(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    lhu a0, 0(a0)
 ; RV64I-NEXT:    lui s2, 16
 ; RV64I-NEXT:    addiw s2, s2, -1
-; RV64I-NEXT:    and s3, s0, s2
+; RV64I-NEXT:    and s3, a1, s2
 ; RV64I-NEXT:    j .LBB122_2
 ; RV64I-NEXT:  .LBB122_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB122_2 Depth=1
-; RV64I-NEXT:    sh a1, 6(sp)
+; RV64I-NEXT:    sh a0, 6(sp)
 ; RV64I-NEXT:    addi a1, sp, 6
 ; RV64I-NEXT:    li a3, 3
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a1, 6(sp)
-; RV64I-NEXT:    bnez a0, .LBB122_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 6(sp)
+; RV64I-NEXT:    bnez a1, .LBB122_4
 ; RV64I-NEXT:  .LBB122_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    and a0, a1, s2
-; RV64I-NEXT:    mv a2, a1
-; RV64I-NEXT:    bltu s3, a0, .LBB122_1
+; RV64I-NEXT:    and a1, a0, s2
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s3, a1, .LBB122_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB122_2 Depth=1
 ; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB122_1
 ; RV64I-NEXT:  .LBB122_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -18533,32 +18533,32 @@ define i16 @atomicrmw_umax_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a1
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lhu a1, 0(a0)
+; RV32I-NEXT:    lhu a0, 0(a0)
 ; RV32I-NEXT:    lui s2, 16
 ; RV32I-NEXT:    addi s2, s2, -1
-; RV32I-NEXT:    and s3, s0, s2
+; RV32I-NEXT:    and s3, a1, s2
 ; RV32I-NEXT:    j .LBB123_2
 ; RV32I-NEXT:  .LBB123_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB123_2 Depth=1
-; RV32I-NEXT:    sh a1, 10(sp)
+; RV32I-NEXT:    sh a0, 10(sp)
 ; RV32I-NEXT:    addi a1, sp, 10
 ; RV32I-NEXT:    li a3, 4
 ; RV32I-NEXT:    li a4, 2
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a1, 10(sp)
-; RV32I-NEXT:    bnez a0, .LBB123_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 10(sp)
+; RV32I-NEXT:    bnez a1, .LBB123_4
 ; RV32I-NEXT:  .LBB123_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    and a0, a1, s2
-; RV32I-NEXT:    mv a2, a1
-; RV32I-NEXT:    bltu s3, a0, .LBB123_1
+; RV32I-NEXT:    and a1, a0, s2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu s3, a1, .LBB123_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB123_2 Depth=1
 ; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB123_1
 ; RV32I-NEXT:  .LBB123_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -18627,32 +18627,32 @@ define i16 @atomicrmw_umax_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    lhu a0, 0(a0)
 ; RV64I-NEXT:    lui s2, 16
 ; RV64I-NEXT:    addiw s2, s2, -1
-; RV64I-NEXT:    and s3, s0, s2
+; RV64I-NEXT:    and s3, a1, s2
 ; RV64I-NEXT:    j .LBB123_2
 ; RV64I-NEXT:  .LBB123_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB123_2 Depth=1
-; RV64I-NEXT:    sh a1, 6(sp)
+; RV64I-NEXT:    sh a0, 6(sp)
 ; RV64I-NEXT:    addi a1, sp, 6
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 2
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a1, 6(sp)
-; RV64I-NEXT:    bnez a0, .LBB123_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 6(sp)
+; RV64I-NEXT:    bnez a1, .LBB123_4
 ; RV64I-NEXT:  .LBB123_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    and a0, a1, s2
-; RV64I-NEXT:    mv a2, a1
-; RV64I-NEXT:    bltu s3, a0, .LBB123_1
+; RV64I-NEXT:    and a1, a0, s2
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s3, a1, .LBB123_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB123_2 Depth=1
 ; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB123_1
 ; RV64I-NEXT:  .LBB123_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -18785,32 +18785,32 @@ define i16 @atomicrmw_umax_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a1
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lhu a1, 0(a0)
+; RV32I-NEXT:    lhu a0, 0(a0)
 ; RV32I-NEXT:    lui s2, 16
 ; RV32I-NEXT:    addi s2, s2, -1
-; RV32I-NEXT:    and s3, s0, s2
+; RV32I-NEXT:    and s3, a1, s2
 ; RV32I-NEXT:    j .LBB124_2
 ; RV32I-NEXT:  .LBB124_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB124_2 Depth=1
-; RV32I-NEXT:    sh a1, 10(sp)
+; RV32I-NEXT:    sh a0, 10(sp)
 ; RV32I-NEXT:    addi a1, sp, 10
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a1, 10(sp)
-; RV32I-NEXT:    bnez a0, .LBB124_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 10(sp)
+; RV32I-NEXT:    bnez a1, .LBB124_4
 ; RV32I-NEXT:  .LBB124_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    and a0, a1, s2
-; RV32I-NEXT:    mv a2, a1
-; RV32I-NEXT:    bltu s3, a0, .LBB124_1
+; RV32I-NEXT:    and a1, a0, s2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu s3, a1, .LBB124_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB124_2 Depth=1
 ; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB124_1
 ; RV32I-NEXT:  .LBB124_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -18854,32 +18854,32 @@ define i16 @atomicrmw_umax_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    lhu a0, 0(a0)
 ; RV64I-NEXT:    lui s2, 16
 ; RV64I-NEXT:    addiw s2, s2, -1
-; RV64I-NEXT:    and s3, s0, s2
+; RV64I-NEXT:    and s3, a1, s2
 ; RV64I-NEXT:    j .LBB124_2
 ; RV64I-NEXT:  .LBB124_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB124_2 Depth=1
-; RV64I-NEXT:    sh a1, 6(sp)
+; RV64I-NEXT:    sh a0, 6(sp)
 ; RV64I-NEXT:    addi a1, sp, 6
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a1, 6(sp)
-; RV64I-NEXT:    bnez a0, .LBB124_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 6(sp)
+; RV64I-NEXT:    bnez a1, .LBB124_4
 ; RV64I-NEXT:  .LBB124_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    and a0, a1, s2
-; RV64I-NEXT:    mv a2, a1
-; RV64I-NEXT:    bltu s3, a0, .LBB124_1
+; RV64I-NEXT:    and a1, a0, s2
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s3, a1, .LBB124_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB124_2 Depth=1
 ; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB124_1
 ; RV64I-NEXT:  .LBB124_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -18962,32 +18962,32 @@ define i16 @atomicrmw_umin_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a1
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lhu a1, 0(a0)
+; RV32I-NEXT:    lhu a0, 0(a0)
 ; RV32I-NEXT:    lui s2, 16
 ; RV32I-NEXT:    addi s2, s2, -1
-; RV32I-NEXT:    and s3, s0, s2
+; RV32I-NEXT:    and s3, a1, s2
 ; RV32I-NEXT:    j .LBB125_2
 ; RV32I-NEXT:  .LBB125_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB125_2 Depth=1
-; RV32I-NEXT:    sh a1, 10(sp)
+; RV32I-NEXT:    sh a0, 10(sp)
 ; RV32I-NEXT:    addi a1, sp, 10
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a1, 10(sp)
-; RV32I-NEXT:    bnez a0, .LBB125_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 10(sp)
+; RV32I-NEXT:    bnez a1, .LBB125_4
 ; RV32I-NEXT:  .LBB125_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    and a0, a1, s2
-; RV32I-NEXT:    mv a2, a1
-; RV32I-NEXT:    bgeu s3, a0, .LBB125_1
+; RV32I-NEXT:    and a1, a0, s2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgeu s3, a1, .LBB125_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB125_2 Depth=1
 ; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB125_1
 ; RV32I-NEXT:  .LBB125_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -19031,32 +19031,32 @@ define i16 @atomicrmw_umin_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    lhu a0, 0(a0)
 ; RV64I-NEXT:    lui s2, 16
 ; RV64I-NEXT:    addiw s2, s2, -1
-; RV64I-NEXT:    and s3, s0, s2
+; RV64I-NEXT:    and s3, a1, s2
 ; RV64I-NEXT:    j .LBB125_2
 ; RV64I-NEXT:  .LBB125_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB125_2 Depth=1
-; RV64I-NEXT:    sh a1, 6(sp)
+; RV64I-NEXT:    sh a0, 6(sp)
 ; RV64I-NEXT:    addi a1, sp, 6
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a1, 6(sp)
-; RV64I-NEXT:    bnez a0, .LBB125_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 6(sp)
+; RV64I-NEXT:    bnez a1, .LBB125_4
 ; RV64I-NEXT:  .LBB125_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    and a0, a1, s2
-; RV64I-NEXT:    mv a2, a1
-; RV64I-NEXT:    bgeu s3, a0, .LBB125_1
+; RV64I-NEXT:    and a1, a0, s2
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s3, a1, .LBB125_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB125_2 Depth=1
 ; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB125_1
 ; RV64I-NEXT:  .LBB125_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -19139,32 +19139,32 @@ define i16 @atomicrmw_umin_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a1
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lhu a1, 0(a0)
+; RV32I-NEXT:    lhu a0, 0(a0)
 ; RV32I-NEXT:    lui s2, 16
 ; RV32I-NEXT:    addi s2, s2, -1
-; RV32I-NEXT:    and s3, s0, s2
+; RV32I-NEXT:    and s3, a1, s2
 ; RV32I-NEXT:    j .LBB126_2
 ; RV32I-NEXT:  .LBB126_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB126_2 Depth=1
-; RV32I-NEXT:    sh a1, 10(sp)
+; RV32I-NEXT:    sh a0, 10(sp)
 ; RV32I-NEXT:    addi a1, sp, 10
 ; RV32I-NEXT:    li a3, 2
 ; RV32I-NEXT:    li a4, 2
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a1, 10(sp)
-; RV32I-NEXT:    bnez a0, .LBB126_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 10(sp)
+; RV32I-NEXT:    bnez a1, .LBB126_4
 ; RV32I-NEXT:  .LBB126_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    and a0, a1, s2
-; RV32I-NEXT:    mv a2, a1
-; RV32I-NEXT:    bgeu s3, a0, .LBB126_1
+; RV32I-NEXT:    and a1, a0, s2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgeu s3, a1, .LBB126_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB126_2 Depth=1
 ; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB126_1
 ; RV32I-NEXT:  .LBB126_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -19233,32 +19233,32 @@ define i16 @atomicrmw_umin_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    lhu a0, 0(a0)
 ; RV64I-NEXT:    lui s2, 16
 ; RV64I-NEXT:    addiw s2, s2, -1
-; RV64I-NEXT:    and s3, s0, s2
+; RV64I-NEXT:    and s3, a1, s2
 ; RV64I-NEXT:    j .LBB126_2
 ; RV64I-NEXT:  .LBB126_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB126_2 Depth=1
-; RV64I-NEXT:    sh a1, 6(sp)
+; RV64I-NEXT:    sh a0, 6(sp)
 ; RV64I-NEXT:    addi a1, sp, 6
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    li a4, 2
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a1, 6(sp)
-; RV64I-NEXT:    bnez a0, .LBB126_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 6(sp)
+; RV64I-NEXT:    bnez a1, .LBB126_4
 ; RV64I-NEXT:  .LBB126_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    and a0, a1, s2
-; RV64I-NEXT:    mv a2, a1
-; RV64I-NEXT:    bgeu s3, a0, .LBB126_1
+; RV64I-NEXT:    and a1, a0, s2
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s3, a1, .LBB126_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB126_2 Depth=1
 ; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB126_1
 ; RV64I-NEXT:  .LBB126_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -19391,32 +19391,32 @@ define i16 @atomicrmw_umin_i16_release(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a1
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lhu a1, 0(a0)
+; RV32I-NEXT:    lhu a0, 0(a0)
 ; RV32I-NEXT:    lui s2, 16
 ; RV32I-NEXT:    addi s2, s2, -1
-; RV32I-NEXT:    and s3, s0, s2
+; RV32I-NEXT:    and s3, a1, s2
 ; RV32I-NEXT:    j .LBB127_2
 ; RV32I-NEXT:  .LBB127_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB127_2 Depth=1
-; RV32I-NEXT:    sh a1, 10(sp)
+; RV32I-NEXT:    sh a0, 10(sp)
 ; RV32I-NEXT:    addi a1, sp, 10
 ; RV32I-NEXT:    li a3, 3
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a1, 10(sp)
-; RV32I-NEXT:    bnez a0, .LBB127_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 10(sp)
+; RV32I-NEXT:    bnez a1, .LBB127_4
 ; RV32I-NEXT:  .LBB127_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    and a0, a1, s2
-; RV32I-NEXT:    mv a2, a1
-; RV32I-NEXT:    bgeu s3, a0, .LBB127_1
+; RV32I-NEXT:    and a1, a0, s2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgeu s3, a1, .LBB127_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB127_2 Depth=1
 ; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB127_1
 ; RV32I-NEXT:  .LBB127_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -19485,32 +19485,32 @@ define i16 @atomicrmw_umin_i16_release(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    lhu a0, 0(a0)
 ; RV64I-NEXT:    lui s2, 16
 ; RV64I-NEXT:    addiw s2, s2, -1
-; RV64I-NEXT:    and s3, s0, s2
+; RV64I-NEXT:    and s3, a1, s2
 ; RV64I-NEXT:    j .LBB127_2
 ; RV64I-NEXT:  .LBB127_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB127_2 Depth=1
-; RV64I-NEXT:    sh a1, 6(sp)
+; RV64I-NEXT:    sh a0, 6(sp)
 ; RV64I-NEXT:    addi a1, sp, 6
 ; RV64I-NEXT:    li a3, 3
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a1, 6(sp)
-; RV64I-NEXT:    bnez a0, .LBB127_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 6(sp)
+; RV64I-NEXT:    bnez a1, .LBB127_4
 ; RV64I-NEXT:  .LBB127_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    and a0, a1, s2
-; RV64I-NEXT:    mv a2, a1
-; RV64I-NEXT:    bgeu s3, a0, .LBB127_1
+; RV64I-NEXT:    and a1, a0, s2
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s3, a1, .LBB127_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB127_2 Depth=1
 ; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB127_1
 ; RV64I-NEXT:  .LBB127_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -19643,32 +19643,32 @@ define i16 @atomicrmw_umin_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a1
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lhu a1, 0(a0)
+; RV32I-NEXT:    lhu a0, 0(a0)
 ; RV32I-NEXT:    lui s2, 16
 ; RV32I-NEXT:    addi s2, s2, -1
-; RV32I-NEXT:    and s3, s0, s2
+; RV32I-NEXT:    and s3, a1, s2
 ; RV32I-NEXT:    j .LBB128_2
 ; RV32I-NEXT:  .LBB128_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB128_2 Depth=1
-; RV32I-NEXT:    sh a1, 10(sp)
+; RV32I-NEXT:    sh a0, 10(sp)
 ; RV32I-NEXT:    addi a1, sp, 10
 ; RV32I-NEXT:    li a3, 4
 ; RV32I-NEXT:    li a4, 2
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a1, 10(sp)
-; RV32I-NEXT:    bnez a0, .LBB128_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 10(sp)
+; RV32I-NEXT:    bnez a1, .LBB128_4
 ; RV32I-NEXT:  .LBB128_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    and a0, a1, s2
-; RV32I-NEXT:    mv a2, a1
-; RV32I-NEXT:    bgeu s3, a0, .LBB128_1
+; RV32I-NEXT:    and a1, a0, s2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgeu s3, a1, .LBB128_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB128_2 Depth=1
 ; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB128_1
 ; RV32I-NEXT:  .LBB128_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -19737,32 +19737,32 @@ define i16 @atomicrmw_umin_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    lhu a0, 0(a0)
 ; RV64I-NEXT:    lui s2, 16
 ; RV64I-NEXT:    addiw s2, s2, -1
-; RV64I-NEXT:    and s3, s0, s2
+; RV64I-NEXT:    and s3, a1, s2
 ; RV64I-NEXT:    j .LBB128_2
 ; RV64I-NEXT:  .LBB128_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB128_2 Depth=1
-; RV64I-NEXT:    sh a1, 6(sp)
+; RV64I-NEXT:    sh a0, 6(sp)
 ; RV64I-NEXT:    addi a1, sp, 6
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 2
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a1, 6(sp)
-; RV64I-NEXT:    bnez a0, .LBB128_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 6(sp)
+; RV64I-NEXT:    bnez a1, .LBB128_4
 ; RV64I-NEXT:  .LBB128_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    and a0, a1, s2
-; RV64I-NEXT:    mv a2, a1
-; RV64I-NEXT:    bgeu s3, a0, .LBB128_1
+; RV64I-NEXT:    and a1, a0, s2
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s3, a1, .LBB128_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB128_2 Depth=1
 ; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB128_1
 ; RV64I-NEXT:  .LBB128_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -19895,32 +19895,32 @@ define i16 @atomicrmw_umin_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a1
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lhu a1, 0(a0)
+; RV32I-NEXT:    lhu a0, 0(a0)
 ; RV32I-NEXT:    lui s2, 16
 ; RV32I-NEXT:    addi s2, s2, -1
-; RV32I-NEXT:    and s3, s0, s2
+; RV32I-NEXT:    and s3, a1, s2
 ; RV32I-NEXT:    j .LBB129_2
 ; RV32I-NEXT:  .LBB129_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB129_2 Depth=1
-; RV32I-NEXT:    sh a1, 10(sp)
+; RV32I-NEXT:    sh a0, 10(sp)
 ; RV32I-NEXT:    addi a1, sp, 10
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a1, 10(sp)
-; RV32I-NEXT:    bnez a0, .LBB129_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 10(sp)
+; RV32I-NEXT:    bnez a1, .LBB129_4
 ; RV32I-NEXT:  .LBB129_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    and a0, a1, s2
-; RV32I-NEXT:    mv a2, a1
-; RV32I-NEXT:    bgeu s3, a0, .LBB129_1
+; RV32I-NEXT:    and a1, a0, s2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgeu s3, a1, .LBB129_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB129_2 Depth=1
 ; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB129_1
 ; RV32I-NEXT:  .LBB129_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -19964,32 +19964,32 @@ define i16 @atomicrmw_umin_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a1
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    lhu a0, 0(a0)
 ; RV64I-NEXT:    lui s2, 16
 ; RV64I-NEXT:    addiw s2, s2, -1
-; RV64I-NEXT:    and s3, s0, s2
+; RV64I-NEXT:    and s3, a1, s2
 ; RV64I-NEXT:    j .LBB129_2
 ; RV64I-NEXT:  .LBB129_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB129_2 Depth=1
-; RV64I-NEXT:    sh a1, 6(sp)
+; RV64I-NEXT:    sh a0, 6(sp)
 ; RV64I-NEXT:    addi a1, sp, 6
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a1, 6(sp)
-; RV64I-NEXT:    bnez a0, .LBB129_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 6(sp)
+; RV64I-NEXT:    bnez a1, .LBB129_4
 ; RV64I-NEXT:  .LBB129_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    and a0, a1, s2
-; RV64I-NEXT:    mv a2, a1
-; RV64I-NEXT:    bgeu s3, a0, .LBB129_1
+; RV64I-NEXT:    and a1, a0, s2
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s3, a1, .LBB129_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB129_2 Depth=1
 ; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB129_1
 ; RV64I-NEXT:  .LBB129_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -22176,30 +22176,30 @@ define i32 @atomicrmw_max_i32_monotonic(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB165_2
 ; RV32I-NEXT:  .LBB165_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB165_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB165_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB165_4
 ; RV32I-NEXT:  .LBB165_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    blt s1, a3, .LBB165_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt s0, a0, .LBB165_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB165_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB165_1
 ; RV32I-NEXT:  .LBB165_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -22218,31 +22218,31 @@ define i32 @atomicrmw_max_i32_monotonic(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB165_2
 ; RV64I-NEXT:  .LBB165_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB165_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB165_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB165_4
 ; RV64I-NEXT:  .LBB165_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s2, a3, .LBB165_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s2, a0, .LBB165_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB165_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB165_1
 ; RV64I-NEXT:  .LBB165_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -22265,30 +22265,30 @@ define i32 @atomicrmw_max_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB166_2
 ; RV32I-NEXT:  .LBB166_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB166_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 2
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB166_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB166_4
 ; RV32I-NEXT:  .LBB166_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    blt s1, a3, .LBB166_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt s0, a0, .LBB166_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB166_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB166_1
 ; RV32I-NEXT:  .LBB166_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -22312,31 +22312,31 @@ define i32 @atomicrmw_max_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB166_2
 ; RV64I-NEXT:  .LBB166_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB166_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB166_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB166_4
 ; RV64I-NEXT:  .LBB166_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s2, a3, .LBB166_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s2, a0, .LBB166_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB166_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB166_1
 ; RV64I-NEXT:  .LBB166_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -22364,30 +22364,30 @@ define i32 @atomicrmw_max_i32_release(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB167_2
 ; RV32I-NEXT:  .LBB167_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB167_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 3
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB167_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB167_4
 ; RV32I-NEXT:  .LBB167_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    blt s1, a3, .LBB167_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt s0, a0, .LBB167_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB167_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB167_1
 ; RV32I-NEXT:  .LBB167_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -22411,31 +22411,31 @@ define i32 @atomicrmw_max_i32_release(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB167_2
 ; RV64I-NEXT:  .LBB167_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB167_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 3
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB167_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB167_4
 ; RV64I-NEXT:  .LBB167_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s2, a3, .LBB167_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s2, a0, .LBB167_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB167_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB167_1
 ; RV64I-NEXT:  .LBB167_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -22463,30 +22463,30 @@ define i32 @atomicrmw_max_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB168_2
 ; RV32I-NEXT:  .LBB168_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB168_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 4
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB168_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB168_4
 ; RV32I-NEXT:  .LBB168_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    blt s1, a3, .LBB168_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt s0, a0, .LBB168_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB168_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB168_1
 ; RV32I-NEXT:  .LBB168_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -22510,31 +22510,31 @@ define i32 @atomicrmw_max_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB168_2
 ; RV64I-NEXT:  .LBB168_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB168_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB168_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB168_4
 ; RV64I-NEXT:  .LBB168_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s2, a3, .LBB168_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s2, a0, .LBB168_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB168_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB168_1
 ; RV64I-NEXT:  .LBB168_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -22562,30 +22562,30 @@ define i32 @atomicrmw_max_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB169_2
 ; RV32I-NEXT:  .LBB169_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB169_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB169_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB169_4
 ; RV32I-NEXT:  .LBB169_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    blt s1, a3, .LBB169_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt s0, a0, .LBB169_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB169_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB169_1
 ; RV32I-NEXT:  .LBB169_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -22609,31 +22609,31 @@ define i32 @atomicrmw_max_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB169_2
 ; RV64I-NEXT:  .LBB169_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB169_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB169_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB169_4
 ; RV64I-NEXT:  .LBB169_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s2, a3, .LBB169_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s2, a0, .LBB169_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB169_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB169_1
 ; RV64I-NEXT:  .LBB169_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -22661,30 +22661,30 @@ define i32 @atomicrmw_min_i32_monotonic(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB170_2
 ; RV32I-NEXT:  .LBB170_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB170_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB170_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB170_4
 ; RV32I-NEXT:  .LBB170_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bge s1, a3, .LBB170_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bge s0, a0, .LBB170_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB170_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB170_1
 ; RV32I-NEXT:  .LBB170_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -22703,31 +22703,31 @@ define i32 @atomicrmw_min_i32_monotonic(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB170_2
 ; RV64I-NEXT:  .LBB170_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB170_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB170_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB170_4
 ; RV64I-NEXT:  .LBB170_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s2, a3, .LBB170_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s2, a0, .LBB170_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB170_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB170_1
 ; RV64I-NEXT:  .LBB170_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -22750,30 +22750,30 @@ define i32 @atomicrmw_min_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB171_2
 ; RV32I-NEXT:  .LBB171_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB171_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 2
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB171_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB171_4
 ; RV32I-NEXT:  .LBB171_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bge s1, a3, .LBB171_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bge s0, a0, .LBB171_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB171_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB171_1
 ; RV32I-NEXT:  .LBB171_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -22797,31 +22797,31 @@ define i32 @atomicrmw_min_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB171_2
 ; RV64I-NEXT:  .LBB171_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB171_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB171_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB171_4
 ; RV64I-NEXT:  .LBB171_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s2, a3, .LBB171_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s2, a0, .LBB171_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB171_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB171_1
 ; RV64I-NEXT:  .LBB171_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -22849,30 +22849,30 @@ define i32 @atomicrmw_min_i32_release(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB172_2
 ; RV32I-NEXT:  .LBB172_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB172_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 3
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB172_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB172_4
 ; RV32I-NEXT:  .LBB172_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bge s1, a3, .LBB172_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bge s0, a0, .LBB172_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB172_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB172_1
 ; RV32I-NEXT:  .LBB172_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -22896,31 +22896,31 @@ define i32 @atomicrmw_min_i32_release(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB172_2
 ; RV64I-NEXT:  .LBB172_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB172_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 3
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB172_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB172_4
 ; RV64I-NEXT:  .LBB172_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s2, a3, .LBB172_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s2, a0, .LBB172_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB172_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB172_1
 ; RV64I-NEXT:  .LBB172_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -22948,30 +22948,30 @@ define i32 @atomicrmw_min_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB173_2
 ; RV32I-NEXT:  .LBB173_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB173_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 4
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB173_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB173_4
 ; RV32I-NEXT:  .LBB173_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bge s1, a3, .LBB173_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bge s0, a0, .LBB173_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB173_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB173_1
 ; RV32I-NEXT:  .LBB173_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -22995,31 +22995,31 @@ define i32 @atomicrmw_min_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB173_2
 ; RV64I-NEXT:  .LBB173_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB173_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB173_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB173_4
 ; RV64I-NEXT:  .LBB173_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s2, a3, .LBB173_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s2, a0, .LBB173_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB173_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB173_1
 ; RV64I-NEXT:  .LBB173_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -23047,30 +23047,30 @@ define i32 @atomicrmw_min_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB174_2
 ; RV32I-NEXT:  .LBB174_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB174_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB174_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB174_4
 ; RV32I-NEXT:  .LBB174_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bge s1, a3, .LBB174_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bge s0, a0, .LBB174_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB174_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB174_1
 ; RV32I-NEXT:  .LBB174_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -23094,31 +23094,31 @@ define i32 @atomicrmw_min_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB174_2
 ; RV64I-NEXT:  .LBB174_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB174_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB174_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB174_4
 ; RV64I-NEXT:  .LBB174_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s2, a3, .LBB174_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s2, a0, .LBB174_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB174_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB174_1
 ; RV64I-NEXT:  .LBB174_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -23146,30 +23146,30 @@ define i32 @atomicrmw_umax_i32_monotonic(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB175_2
 ; RV32I-NEXT:  .LBB175_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB175_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB175_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB175_4
 ; RV32I-NEXT:  .LBB175_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bltu s1, a3, .LBB175_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu s0, a0, .LBB175_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB175_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB175_1
 ; RV32I-NEXT:  .LBB175_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -23188,31 +23188,31 @@ define i32 @atomicrmw_umax_i32_monotonic(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB175_2
 ; RV64I-NEXT:  .LBB175_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB175_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB175_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB175_4
 ; RV64I-NEXT:  .LBB175_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s2, a3, .LBB175_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s2, a0, .LBB175_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB175_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB175_1
 ; RV64I-NEXT:  .LBB175_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -23235,30 +23235,30 @@ define i32 @atomicrmw_umax_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB176_2
 ; RV32I-NEXT:  .LBB176_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB176_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 2
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB176_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB176_4
 ; RV32I-NEXT:  .LBB176_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bltu s1, a3, .LBB176_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu s0, a0, .LBB176_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB176_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB176_1
 ; RV32I-NEXT:  .LBB176_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -23282,31 +23282,31 @@ define i32 @atomicrmw_umax_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB176_2
 ; RV64I-NEXT:  .LBB176_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB176_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB176_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB176_4
 ; RV64I-NEXT:  .LBB176_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s2, a3, .LBB176_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s2, a0, .LBB176_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB176_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB176_1
 ; RV64I-NEXT:  .LBB176_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -23334,30 +23334,30 @@ define i32 @atomicrmw_umax_i32_release(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB177_2
 ; RV32I-NEXT:  .LBB177_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB177_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 3
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB177_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB177_4
 ; RV32I-NEXT:  .LBB177_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bltu s1, a3, .LBB177_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu s0, a0, .LBB177_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB177_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB177_1
 ; RV32I-NEXT:  .LBB177_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -23381,31 +23381,31 @@ define i32 @atomicrmw_umax_i32_release(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB177_2
 ; RV64I-NEXT:  .LBB177_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB177_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 3
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB177_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB177_4
 ; RV64I-NEXT:  .LBB177_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s2, a3, .LBB177_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s2, a0, .LBB177_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB177_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB177_1
 ; RV64I-NEXT:  .LBB177_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -23433,30 +23433,30 @@ define i32 @atomicrmw_umax_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB178_2
 ; RV32I-NEXT:  .LBB178_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB178_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 4
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB178_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB178_4
 ; RV32I-NEXT:  .LBB178_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bltu s1, a3, .LBB178_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu s0, a0, .LBB178_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB178_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB178_1
 ; RV32I-NEXT:  .LBB178_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -23480,31 +23480,31 @@ define i32 @atomicrmw_umax_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB178_2
 ; RV64I-NEXT:  .LBB178_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB178_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB178_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB178_4
 ; RV64I-NEXT:  .LBB178_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s2, a3, .LBB178_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s2, a0, .LBB178_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB178_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB178_1
 ; RV64I-NEXT:  .LBB178_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -23532,30 +23532,30 @@ define i32 @atomicrmw_umax_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB179_2
 ; RV32I-NEXT:  .LBB179_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB179_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB179_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB179_4
 ; RV32I-NEXT:  .LBB179_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bltu s1, a3, .LBB179_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu s0, a0, .LBB179_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB179_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB179_1
 ; RV32I-NEXT:  .LBB179_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -23579,31 +23579,31 @@ define i32 @atomicrmw_umax_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB179_2
 ; RV64I-NEXT:  .LBB179_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB179_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB179_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB179_4
 ; RV64I-NEXT:  .LBB179_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s2, a3, .LBB179_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s2, a0, .LBB179_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB179_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB179_1
 ; RV64I-NEXT:  .LBB179_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -23631,30 +23631,30 @@ define i32 @atomicrmw_umin_i32_monotonic(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB180_2
 ; RV32I-NEXT:  .LBB180_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB180_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB180_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB180_4
 ; RV32I-NEXT:  .LBB180_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bgeu s1, a3, .LBB180_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgeu s0, a0, .LBB180_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB180_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB180_1
 ; RV32I-NEXT:  .LBB180_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -23673,31 +23673,31 @@ define i32 @atomicrmw_umin_i32_monotonic(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB180_2
 ; RV64I-NEXT:  .LBB180_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB180_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB180_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB180_4
 ; RV64I-NEXT:  .LBB180_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s2, a3, .LBB180_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s2, a0, .LBB180_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB180_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB180_1
 ; RV64I-NEXT:  .LBB180_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -23720,30 +23720,30 @@ define i32 @atomicrmw_umin_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB181_2
 ; RV32I-NEXT:  .LBB181_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB181_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 2
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB181_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB181_4
 ; RV32I-NEXT:  .LBB181_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bgeu s1, a3, .LBB181_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgeu s0, a0, .LBB181_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB181_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB181_1
 ; RV32I-NEXT:  .LBB181_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -23767,31 +23767,31 @@ define i32 @atomicrmw_umin_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB181_2
 ; RV64I-NEXT:  .LBB181_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB181_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB181_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB181_4
 ; RV64I-NEXT:  .LBB181_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s2, a3, .LBB181_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s2, a0, .LBB181_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB181_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB181_1
 ; RV64I-NEXT:  .LBB181_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -23819,30 +23819,30 @@ define i32 @atomicrmw_umin_i32_release(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB182_2
 ; RV32I-NEXT:  .LBB182_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB182_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 3
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB182_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB182_4
 ; RV32I-NEXT:  .LBB182_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bgeu s1, a3, .LBB182_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgeu s0, a0, .LBB182_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB182_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB182_1
 ; RV32I-NEXT:  .LBB182_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -23866,31 +23866,31 @@ define i32 @atomicrmw_umin_i32_release(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB182_2
 ; RV64I-NEXT:  .LBB182_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB182_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 3
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB182_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB182_4
 ; RV64I-NEXT:  .LBB182_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s2, a3, .LBB182_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s2, a0, .LBB182_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB182_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB182_1
 ; RV64I-NEXT:  .LBB182_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -23918,30 +23918,30 @@ define i32 @atomicrmw_umin_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB183_2
 ; RV32I-NEXT:  .LBB183_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB183_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 4
 ; RV32I-NEXT:    li a4, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB183_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB183_4
 ; RV32I-NEXT:  .LBB183_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bgeu s1, a3, .LBB183_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgeu s0, a0, .LBB183_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB183_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB183_1
 ; RV32I-NEXT:  .LBB183_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -23965,31 +23965,31 @@ define i32 @atomicrmw_umin_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB183_2
 ; RV64I-NEXT:  .LBB183_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB183_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB183_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB183_4
 ; RV64I-NEXT:  .LBB183_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s2, a3, .LBB183_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s2, a0, .LBB183_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB183_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB183_1
 ; RV64I-NEXT:  .LBB183_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -24017,30 +24017,30 @@ define i32 @atomicrmw_umin_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB184_2
 ; RV32I-NEXT:  .LBB184_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB184_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB184_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB184_4
 ; RV32I-NEXT:  .LBB184_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bgeu s1, a3, .LBB184_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgeu s0, a0, .LBB184_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB184_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB184_1
 ; RV32I-NEXT:  .LBB184_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -24064,31 +24064,31 @@ define i32 @atomicrmw_umin_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB184_2
 ; RV64I-NEXT:  .LBB184_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB184_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB184_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB184_4
 ; RV64I-NEXT:  .LBB184_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s2, a3, .LBB184_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s2, a0, .LBB184_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB184_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB184_1
 ; RV64I-NEXT:  .LBB184_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -26073,45 +26073,44 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB220_2
 ; RV32I-NEXT:  .LBB220_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB220_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB220_7
 ; RV32I-NEXT:  .LBB220_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB220_4
+; RV32I-NEXT:    beq a1, s0, .LBB220_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32I-NEXT:    slt a0, s0, a5
+; RV32I-NEXT:    slt a4, s0, a1
 ; RV32I-NEXT:    j .LBB220_5
 ; RV32I-NEXT:  .LBB220_4: # in Loop: Header=BB220_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB220_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    bnez a0, .LBB220_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    bnez a4, .LBB220_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB220_1
 ; RV32I-NEXT:  .LBB220_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26127,45 +26126,44 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB220_2
 ; RV32IA-NEXT:  .LBB220_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB220_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB220_7
 ; RV32IA-NEXT:  .LBB220_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB220_4
+; RV32IA-NEXT:    beq a1, s0, .LBB220_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32IA-NEXT:    slt a0, s0, a5
+; RV32IA-NEXT:    slt a4, s0, a1
 ; RV32IA-NEXT:    j .LBB220_5
 ; RV32IA-NEXT:  .LBB220_4: # in Loop: Header=BB220_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB220_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    bnez a0, .LBB220_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    bnez a4, .LBB220_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB220_1
 ; RV32IA-NEXT:  .LBB220_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26179,30 +26177,30 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB220_2
 ; RV64I-NEXT:  .LBB220_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB220_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB220_4
 ; RV64I-NEXT:  .LBB220_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s1, a3, .LBB220_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s0, a0, .LBB220_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB220_1
 ; RV64I-NEXT:  .LBB220_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -26226,45 +26224,44 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB221_2
 ; RV32I-NEXT:  .LBB221_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 2
 ; RV32I-NEXT:    li a5, 2
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB221_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB221_7
 ; RV32I-NEXT:  .LBB221_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB221_4
+; RV32I-NEXT:    beq a1, s0, .LBB221_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32I-NEXT:    slt a0, s0, a5
+; RV32I-NEXT:    slt a4, s0, a1
 ; RV32I-NEXT:    j .LBB221_5
 ; RV32I-NEXT:  .LBB221_4: # in Loop: Header=BB221_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB221_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    bnez a0, .LBB221_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    bnez a4, .LBB221_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB221_1
 ; RV32I-NEXT:  .LBB221_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26280,45 +26277,44 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB221_2
 ; RV32IA-NEXT:  .LBB221_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 2
 ; RV32IA-NEXT:    li a5, 2
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB221_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB221_7
 ; RV32IA-NEXT:  .LBB221_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB221_4
+; RV32IA-NEXT:    beq a1, s0, .LBB221_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32IA-NEXT:    slt a0, s0, a5
+; RV32IA-NEXT:    slt a4, s0, a1
 ; RV32IA-NEXT:    j .LBB221_5
 ; RV32IA-NEXT:  .LBB221_4: # in Loop: Header=BB221_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB221_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    bnez a0, .LBB221_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    bnez a4, .LBB221_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB221_1
 ; RV32IA-NEXT:  .LBB221_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26332,30 +26328,30 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB221_2
 ; RV64I-NEXT:  .LBB221_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB221_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB221_4
 ; RV64I-NEXT:  .LBB221_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s1, a3, .LBB221_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s0, a0, .LBB221_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB221_1
 ; RV64I-NEXT:  .LBB221_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -26384,45 +26380,44 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB222_2
 ; RV32I-NEXT:  .LBB222_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 3
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB222_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB222_7
 ; RV32I-NEXT:  .LBB222_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB222_4
+; RV32I-NEXT:    beq a1, s0, .LBB222_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32I-NEXT:    slt a0, s0, a5
+; RV32I-NEXT:    slt a4, s0, a1
 ; RV32I-NEXT:    j .LBB222_5
 ; RV32I-NEXT:  .LBB222_4: # in Loop: Header=BB222_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB222_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    bnez a0, .LBB222_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    bnez a4, .LBB222_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB222_1
 ; RV32I-NEXT:  .LBB222_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26438,45 +26433,44 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB222_2
 ; RV32IA-NEXT:  .LBB222_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 3
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB222_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB222_7
 ; RV32IA-NEXT:  .LBB222_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB222_4
+; RV32IA-NEXT:    beq a1, s0, .LBB222_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32IA-NEXT:    slt a0, s0, a5
+; RV32IA-NEXT:    slt a4, s0, a1
 ; RV32IA-NEXT:    j .LBB222_5
 ; RV32IA-NEXT:  .LBB222_4: # in Loop: Header=BB222_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB222_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    bnez a0, .LBB222_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    bnez a4, .LBB222_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB222_1
 ; RV32IA-NEXT:  .LBB222_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26490,30 +26484,30 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB222_2
 ; RV64I-NEXT:  .LBB222_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 3
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB222_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB222_4
 ; RV64I-NEXT:  .LBB222_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s1, a3, .LBB222_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s0, a0, .LBB222_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB222_1
 ; RV64I-NEXT:  .LBB222_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -26542,45 +26536,44 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB223_2
 ; RV32I-NEXT:  .LBB223_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 4
 ; RV32I-NEXT:    li a5, 2
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB223_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB223_7
 ; RV32I-NEXT:  .LBB223_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB223_4
+; RV32I-NEXT:    beq a1, s0, .LBB223_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32I-NEXT:    slt a0, s0, a5
+; RV32I-NEXT:    slt a4, s0, a1
 ; RV32I-NEXT:    j .LBB223_5
 ; RV32I-NEXT:  .LBB223_4: # in Loop: Header=BB223_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB223_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    bnez a0, .LBB223_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    bnez a4, .LBB223_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB223_1
 ; RV32I-NEXT:  .LBB223_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26596,45 +26589,44 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB223_2
 ; RV32IA-NEXT:  .LBB223_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 4
 ; RV32IA-NEXT:    li a5, 2
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB223_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB223_7
 ; RV32IA-NEXT:  .LBB223_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB223_4
+; RV32IA-NEXT:    beq a1, s0, .LBB223_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32IA-NEXT:    slt a0, s0, a5
+; RV32IA-NEXT:    slt a4, s0, a1
 ; RV32IA-NEXT:    j .LBB223_5
 ; RV32IA-NEXT:  .LBB223_4: # in Loop: Header=BB223_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB223_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    bnez a0, .LBB223_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    bnez a4, .LBB223_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB223_1
 ; RV32IA-NEXT:  .LBB223_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26648,30 +26640,30 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB223_2
 ; RV64I-NEXT:  .LBB223_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB223_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB223_4
 ; RV64I-NEXT:  .LBB223_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s1, a3, .LBB223_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s0, a0, .LBB223_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB223_1
 ; RV64I-NEXT:  .LBB223_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -26700,45 +26692,44 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB224_2
 ; RV32I-NEXT:  .LBB224_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB224_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB224_7
 ; RV32I-NEXT:  .LBB224_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB224_4
+; RV32I-NEXT:    beq a1, s0, .LBB224_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32I-NEXT:    slt a0, s0, a5
+; RV32I-NEXT:    slt a4, s0, a1
 ; RV32I-NEXT:    j .LBB224_5
 ; RV32I-NEXT:  .LBB224_4: # in Loop: Header=BB224_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB224_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    bnez a0, .LBB224_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    bnez a4, .LBB224_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB224_1
 ; RV32I-NEXT:  .LBB224_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26754,45 +26745,44 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB224_2
 ; RV32IA-NEXT:  .LBB224_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB224_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB224_7
 ; RV32IA-NEXT:  .LBB224_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB224_4
+; RV32IA-NEXT:    beq a1, s0, .LBB224_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32IA-NEXT:    slt a0, s0, a5
+; RV32IA-NEXT:    slt a4, s0, a1
 ; RV32IA-NEXT:    j .LBB224_5
 ; RV32IA-NEXT:  .LBB224_4: # in Loop: Header=BB224_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB224_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    bnez a0, .LBB224_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    bnez a4, .LBB224_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB224_1
 ; RV32IA-NEXT:  .LBB224_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26806,30 +26796,30 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB224_2
 ; RV64I-NEXT:  .LBB224_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB224_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB224_4
 ; RV64I-NEXT:  .LBB224_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s1, a3, .LBB224_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s0, a0, .LBB224_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB224_1
 ; RV64I-NEXT:  .LBB224_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -26858,45 +26848,44 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB225_2
 ; RV32I-NEXT:  .LBB225_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB225_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB225_7
 ; RV32I-NEXT:  .LBB225_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB225_4
+; RV32I-NEXT:    beq a1, s0, .LBB225_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32I-NEXT:    slt a0, s0, a5
+; RV32I-NEXT:    slt a4, s0, a1
 ; RV32I-NEXT:    j .LBB225_5
 ; RV32I-NEXT:  .LBB225_4: # in Loop: Header=BB225_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB225_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    beqz a0, .LBB225_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    beqz a4, .LBB225_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB225_1
 ; RV32I-NEXT:  .LBB225_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26912,45 +26901,44 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB225_2
 ; RV32IA-NEXT:  .LBB225_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB225_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB225_7
 ; RV32IA-NEXT:  .LBB225_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB225_4
+; RV32IA-NEXT:    beq a1, s0, .LBB225_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32IA-NEXT:    slt a0, s0, a5
+; RV32IA-NEXT:    slt a4, s0, a1
 ; RV32IA-NEXT:    j .LBB225_5
 ; RV32IA-NEXT:  .LBB225_4: # in Loop: Header=BB225_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB225_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    beqz a0, .LBB225_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    beqz a4, .LBB225_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB225_1
 ; RV32IA-NEXT:  .LBB225_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26964,30 +26952,30 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB225_2
 ; RV64I-NEXT:  .LBB225_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB225_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB225_4
 ; RV64I-NEXT:  .LBB225_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s1, a3, .LBB225_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s0, a0, .LBB225_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB225_1
 ; RV64I-NEXT:  .LBB225_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -27011,45 +26999,44 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB226_2
 ; RV32I-NEXT:  .LBB226_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 2
 ; RV32I-NEXT:    li a5, 2
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB226_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB226_7
 ; RV32I-NEXT:  .LBB226_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB226_4
+; RV32I-NEXT:    beq a1, s0, .LBB226_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32I-NEXT:    slt a0, s0, a5
+; RV32I-NEXT:    slt a4, s0, a1
 ; RV32I-NEXT:    j .LBB226_5
 ; RV32I-NEXT:  .LBB226_4: # in Loop: Header=BB226_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB226_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    beqz a0, .LBB226_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    beqz a4, .LBB226_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB226_1
 ; RV32I-NEXT:  .LBB226_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27065,45 +27052,44 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB226_2
 ; RV32IA-NEXT:  .LBB226_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 2
 ; RV32IA-NEXT:    li a5, 2
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB226_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB226_7
 ; RV32IA-NEXT:  .LBB226_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB226_4
+; RV32IA-NEXT:    beq a1, s0, .LBB226_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32IA-NEXT:    slt a0, s0, a5
+; RV32IA-NEXT:    slt a4, s0, a1
 ; RV32IA-NEXT:    j .LBB226_5
 ; RV32IA-NEXT:  .LBB226_4: # in Loop: Header=BB226_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB226_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    beqz a0, .LBB226_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    beqz a4, .LBB226_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB226_1
 ; RV32IA-NEXT:  .LBB226_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27117,30 +27103,30 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB226_2
 ; RV64I-NEXT:  .LBB226_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB226_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB226_4
 ; RV64I-NEXT:  .LBB226_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s1, a3, .LBB226_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s0, a0, .LBB226_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB226_1
 ; RV64I-NEXT:  .LBB226_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -27169,45 +27155,44 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB227_2
 ; RV32I-NEXT:  .LBB227_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 3
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB227_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB227_7
 ; RV32I-NEXT:  .LBB227_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB227_4
+; RV32I-NEXT:    beq a1, s0, .LBB227_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32I-NEXT:    slt a0, s0, a5
+; RV32I-NEXT:    slt a4, s0, a1
 ; RV32I-NEXT:    j .LBB227_5
 ; RV32I-NEXT:  .LBB227_4: # in Loop: Header=BB227_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB227_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    beqz a0, .LBB227_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    beqz a4, .LBB227_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB227_1
 ; RV32I-NEXT:  .LBB227_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27223,45 +27208,44 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB227_2
 ; RV32IA-NEXT:  .LBB227_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 3
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB227_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB227_7
 ; RV32IA-NEXT:  .LBB227_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB227_4
+; RV32IA-NEXT:    beq a1, s0, .LBB227_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32IA-NEXT:    slt a0, s0, a5
+; RV32IA-NEXT:    slt a4, s0, a1
 ; RV32IA-NEXT:    j .LBB227_5
 ; RV32IA-NEXT:  .LBB227_4: # in Loop: Header=BB227_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB227_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    beqz a0, .LBB227_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    beqz a4, .LBB227_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB227_1
 ; RV32IA-NEXT:  .LBB227_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27275,30 +27259,30 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB227_2
 ; RV64I-NEXT:  .LBB227_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 3
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB227_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB227_4
 ; RV64I-NEXT:  .LBB227_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s1, a3, .LBB227_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s0, a0, .LBB227_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB227_1
 ; RV64I-NEXT:  .LBB227_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -27327,45 +27311,44 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB228_2
 ; RV32I-NEXT:  .LBB228_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 4
 ; RV32I-NEXT:    li a5, 2
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB228_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB228_7
 ; RV32I-NEXT:  .LBB228_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB228_4
+; RV32I-NEXT:    beq a1, s0, .LBB228_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32I-NEXT:    slt a0, s0, a5
+; RV32I-NEXT:    slt a4, s0, a1
 ; RV32I-NEXT:    j .LBB228_5
 ; RV32I-NEXT:  .LBB228_4: # in Loop: Header=BB228_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB228_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    beqz a0, .LBB228_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    beqz a4, .LBB228_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB228_1
 ; RV32I-NEXT:  .LBB228_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27381,45 +27364,44 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB228_2
 ; RV32IA-NEXT:  .LBB228_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 4
 ; RV32IA-NEXT:    li a5, 2
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB228_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB228_7
 ; RV32IA-NEXT:  .LBB228_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB228_4
+; RV32IA-NEXT:    beq a1, s0, .LBB228_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32IA-NEXT:    slt a0, s0, a5
+; RV32IA-NEXT:    slt a4, s0, a1
 ; RV32IA-NEXT:    j .LBB228_5
 ; RV32IA-NEXT:  .LBB228_4: # in Loop: Header=BB228_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB228_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    beqz a0, .LBB228_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    beqz a4, .LBB228_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB228_1
 ; RV32IA-NEXT:  .LBB228_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27433,30 +27415,30 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB228_2
 ; RV64I-NEXT:  .LBB228_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB228_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB228_4
 ; RV64I-NEXT:  .LBB228_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s1, a3, .LBB228_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s0, a0, .LBB228_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB228_1
 ; RV64I-NEXT:  .LBB228_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -27485,45 +27467,44 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB229_2
 ; RV32I-NEXT:  .LBB229_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB229_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB229_7
 ; RV32I-NEXT:  .LBB229_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB229_4
+; RV32I-NEXT:    beq a1, s0, .LBB229_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32I-NEXT:    slt a0, s0, a5
+; RV32I-NEXT:    slt a4, s0, a1
 ; RV32I-NEXT:    j .LBB229_5
 ; RV32I-NEXT:  .LBB229_4: # in Loop: Header=BB229_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB229_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    beqz a0, .LBB229_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    beqz a4, .LBB229_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB229_1
 ; RV32I-NEXT:  .LBB229_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27539,45 +27520,44 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB229_2
 ; RV32IA-NEXT:  .LBB229_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB229_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB229_7
 ; RV32IA-NEXT:  .LBB229_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB229_4
+; RV32IA-NEXT:    beq a1, s0, .LBB229_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32IA-NEXT:    slt a0, s0, a5
+; RV32IA-NEXT:    slt a4, s0, a1
 ; RV32IA-NEXT:    j .LBB229_5
 ; RV32IA-NEXT:  .LBB229_4: # in Loop: Header=BB229_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB229_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    beqz a0, .LBB229_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    beqz a4, .LBB229_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB229_1
 ; RV32IA-NEXT:  .LBB229_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27591,30 +27571,30 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB229_2
 ; RV64I-NEXT:  .LBB229_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB229_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB229_4
 ; RV64I-NEXT:  .LBB229_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s1, a3, .LBB229_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s0, a0, .LBB229_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB229_1
 ; RV64I-NEXT:  .LBB229_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -27643,45 +27623,44 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB230_2
 ; RV32I-NEXT:  .LBB230_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB230_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB230_7
 ; RV32I-NEXT:  .LBB230_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB230_4
+; RV32I-NEXT:    beq a1, s0, .LBB230_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32I-NEXT:    sltu a0, s0, a5
+; RV32I-NEXT:    sltu a4, s0, a1
 ; RV32I-NEXT:    j .LBB230_5
 ; RV32I-NEXT:  .LBB230_4: # in Loop: Header=BB230_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB230_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    bnez a0, .LBB230_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    bnez a4, .LBB230_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB230_1
 ; RV32I-NEXT:  .LBB230_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27697,45 +27676,44 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB230_2
 ; RV32IA-NEXT:  .LBB230_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB230_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB230_7
 ; RV32IA-NEXT:  .LBB230_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB230_4
+; RV32IA-NEXT:    beq a1, s0, .LBB230_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s0, a5
+; RV32IA-NEXT:    sltu a4, s0, a1
 ; RV32IA-NEXT:    j .LBB230_5
 ; RV32IA-NEXT:  .LBB230_4: # in Loop: Header=BB230_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB230_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    bnez a0, .LBB230_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    bnez a4, .LBB230_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB230_1
 ; RV32IA-NEXT:  .LBB230_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27749,30 +27727,30 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB230_2
 ; RV64I-NEXT:  .LBB230_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB230_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB230_4
 ; RV64I-NEXT:  .LBB230_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s1, a3, .LBB230_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s0, a0, .LBB230_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB230_1
 ; RV64I-NEXT:  .LBB230_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -27796,45 +27774,44 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB231_2
 ; RV32I-NEXT:  .LBB231_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 2
 ; RV32I-NEXT:    li a5, 2
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB231_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB231_7
 ; RV32I-NEXT:  .LBB231_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB231_4
+; RV32I-NEXT:    beq a1, s0, .LBB231_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32I-NEXT:    sltu a0, s0, a5
+; RV32I-NEXT:    sltu a4, s0, a1
 ; RV32I-NEXT:    j .LBB231_5
 ; RV32I-NEXT:  .LBB231_4: # in Loop: Header=BB231_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB231_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    bnez a0, .LBB231_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    bnez a4, .LBB231_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB231_1
 ; RV32I-NEXT:  .LBB231_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27850,45 +27827,44 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB231_2
 ; RV32IA-NEXT:  .LBB231_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 2
 ; RV32IA-NEXT:    li a5, 2
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB231_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB231_7
 ; RV32IA-NEXT:  .LBB231_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB231_4
+; RV32IA-NEXT:    beq a1, s0, .LBB231_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s0, a5
+; RV32IA-NEXT:    sltu a4, s0, a1
 ; RV32IA-NEXT:    j .LBB231_5
 ; RV32IA-NEXT:  .LBB231_4: # in Loop: Header=BB231_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB231_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    bnez a0, .LBB231_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    bnez a4, .LBB231_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB231_1
 ; RV32IA-NEXT:  .LBB231_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27902,30 +27878,30 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB231_2
 ; RV64I-NEXT:  .LBB231_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB231_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB231_4
 ; RV64I-NEXT:  .LBB231_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s1, a3, .LBB231_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s0, a0, .LBB231_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB231_1
 ; RV64I-NEXT:  .LBB231_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -27954,45 +27930,44 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB232_2
 ; RV32I-NEXT:  .LBB232_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 3
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB232_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB232_7
 ; RV32I-NEXT:  .LBB232_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB232_4
+; RV32I-NEXT:    beq a1, s0, .LBB232_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32I-NEXT:    sltu a0, s0, a5
+; RV32I-NEXT:    sltu a4, s0, a1
 ; RV32I-NEXT:    j .LBB232_5
 ; RV32I-NEXT:  .LBB232_4: # in Loop: Header=BB232_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB232_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    bnez a0, .LBB232_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    bnez a4, .LBB232_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB232_1
 ; RV32I-NEXT:  .LBB232_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28008,45 +27983,44 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB232_2
 ; RV32IA-NEXT:  .LBB232_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 3
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB232_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB232_7
 ; RV32IA-NEXT:  .LBB232_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB232_4
+; RV32IA-NEXT:    beq a1, s0, .LBB232_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s0, a5
+; RV32IA-NEXT:    sltu a4, s0, a1
 ; RV32IA-NEXT:    j .LBB232_5
 ; RV32IA-NEXT:  .LBB232_4: # in Loop: Header=BB232_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB232_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    bnez a0, .LBB232_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    bnez a4, .LBB232_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB232_1
 ; RV32IA-NEXT:  .LBB232_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28060,30 +28034,30 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB232_2
 ; RV64I-NEXT:  .LBB232_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 3
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB232_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB232_4
 ; RV64I-NEXT:  .LBB232_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s1, a3, .LBB232_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s0, a0, .LBB232_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB232_1
 ; RV64I-NEXT:  .LBB232_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -28112,45 +28086,44 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB233_2
 ; RV32I-NEXT:  .LBB233_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 4
 ; RV32I-NEXT:    li a5, 2
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB233_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB233_7
 ; RV32I-NEXT:  .LBB233_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB233_4
+; RV32I-NEXT:    beq a1, s0, .LBB233_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32I-NEXT:    sltu a0, s0, a5
+; RV32I-NEXT:    sltu a4, s0, a1
 ; RV32I-NEXT:    j .LBB233_5
 ; RV32I-NEXT:  .LBB233_4: # in Loop: Header=BB233_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB233_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    bnez a0, .LBB233_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    bnez a4, .LBB233_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB233_1
 ; RV32I-NEXT:  .LBB233_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28166,45 +28139,44 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB233_2
 ; RV32IA-NEXT:  .LBB233_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 4
 ; RV32IA-NEXT:    li a5, 2
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB233_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB233_7
 ; RV32IA-NEXT:  .LBB233_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB233_4
+; RV32IA-NEXT:    beq a1, s0, .LBB233_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s0, a5
+; RV32IA-NEXT:    sltu a4, s0, a1
 ; RV32IA-NEXT:    j .LBB233_5
 ; RV32IA-NEXT:  .LBB233_4: # in Loop: Header=BB233_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB233_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    bnez a0, .LBB233_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    bnez a4, .LBB233_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB233_1
 ; RV32IA-NEXT:  .LBB233_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28218,30 +28190,30 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB233_2
 ; RV64I-NEXT:  .LBB233_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB233_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB233_4
 ; RV64I-NEXT:  .LBB233_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s1, a3, .LBB233_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s0, a0, .LBB233_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB233_1
 ; RV64I-NEXT:  .LBB233_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -28270,45 +28242,44 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB234_2
 ; RV32I-NEXT:  .LBB234_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB234_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB234_7
 ; RV32I-NEXT:  .LBB234_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB234_4
+; RV32I-NEXT:    beq a1, s0, .LBB234_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32I-NEXT:    sltu a0, s0, a5
+; RV32I-NEXT:    sltu a4, s0, a1
 ; RV32I-NEXT:    j .LBB234_5
 ; RV32I-NEXT:  .LBB234_4: # in Loop: Header=BB234_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB234_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    bnez a0, .LBB234_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    bnez a4, .LBB234_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB234_1
 ; RV32I-NEXT:  .LBB234_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28324,45 +28295,44 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB234_2
 ; RV32IA-NEXT:  .LBB234_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB234_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB234_7
 ; RV32IA-NEXT:  .LBB234_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB234_4
+; RV32IA-NEXT:    beq a1, s0, .LBB234_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s0, a5
+; RV32IA-NEXT:    sltu a4, s0, a1
 ; RV32IA-NEXT:    j .LBB234_5
 ; RV32IA-NEXT:  .LBB234_4: # in Loop: Header=BB234_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB234_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    bnez a0, .LBB234_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    bnez a4, .LBB234_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB234_1
 ; RV32IA-NEXT:  .LBB234_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28376,30 +28346,30 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB234_2
 ; RV64I-NEXT:  .LBB234_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB234_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB234_4
 ; RV64I-NEXT:  .LBB234_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s1, a3, .LBB234_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s0, a0, .LBB234_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB234_1
 ; RV64I-NEXT:  .LBB234_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -28428,45 +28398,44 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB235_2
 ; RV32I-NEXT:  .LBB235_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB235_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB235_7
 ; RV32I-NEXT:  .LBB235_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB235_4
+; RV32I-NEXT:    beq a1, s0, .LBB235_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32I-NEXT:    sltu a0, s0, a5
+; RV32I-NEXT:    sltu a4, s0, a1
 ; RV32I-NEXT:    j .LBB235_5
 ; RV32I-NEXT:  .LBB235_4: # in Loop: Header=BB235_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB235_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    beqz a0, .LBB235_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    beqz a4, .LBB235_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB235_1
 ; RV32I-NEXT:  .LBB235_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28482,45 +28451,44 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB235_2
 ; RV32IA-NEXT:  .LBB235_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB235_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB235_7
 ; RV32IA-NEXT:  .LBB235_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB235_4
+; RV32IA-NEXT:    beq a1, s0, .LBB235_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s0, a5
+; RV32IA-NEXT:    sltu a4, s0, a1
 ; RV32IA-NEXT:    j .LBB235_5
 ; RV32IA-NEXT:  .LBB235_4: # in Loop: Header=BB235_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB235_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    beqz a0, .LBB235_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    beqz a4, .LBB235_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB235_1
 ; RV32IA-NEXT:  .LBB235_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28534,30 +28502,30 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB235_2
 ; RV64I-NEXT:  .LBB235_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB235_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB235_4
 ; RV64I-NEXT:  .LBB235_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s1, a3, .LBB235_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s0, a0, .LBB235_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB235_1
 ; RV64I-NEXT:  .LBB235_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -28581,45 +28549,44 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB236_2
 ; RV32I-NEXT:  .LBB236_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 2
 ; RV32I-NEXT:    li a5, 2
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB236_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB236_7
 ; RV32I-NEXT:  .LBB236_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB236_4
+; RV32I-NEXT:    beq a1, s0, .LBB236_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32I-NEXT:    sltu a0, s0, a5
+; RV32I-NEXT:    sltu a4, s0, a1
 ; RV32I-NEXT:    j .LBB236_5
 ; RV32I-NEXT:  .LBB236_4: # in Loop: Header=BB236_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB236_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    beqz a0, .LBB236_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    beqz a4, .LBB236_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB236_1
 ; RV32I-NEXT:  .LBB236_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28635,45 +28602,44 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB236_2
 ; RV32IA-NEXT:  .LBB236_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 2
 ; RV32IA-NEXT:    li a5, 2
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB236_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB236_7
 ; RV32IA-NEXT:  .LBB236_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB236_4
+; RV32IA-NEXT:    beq a1, s0, .LBB236_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s0, a5
+; RV32IA-NEXT:    sltu a4, s0, a1
 ; RV32IA-NEXT:    j .LBB236_5
 ; RV32IA-NEXT:  .LBB236_4: # in Loop: Header=BB236_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB236_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    beqz a0, .LBB236_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    beqz a4, .LBB236_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB236_1
 ; RV32IA-NEXT:  .LBB236_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28687,30 +28653,30 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB236_2
 ; RV64I-NEXT:  .LBB236_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB236_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB236_4
 ; RV64I-NEXT:  .LBB236_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s1, a3, .LBB236_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s0, a0, .LBB236_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB236_1
 ; RV64I-NEXT:  .LBB236_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -28739,45 +28705,44 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB237_2
 ; RV32I-NEXT:  .LBB237_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 3
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB237_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB237_7
 ; RV32I-NEXT:  .LBB237_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB237_4
+; RV32I-NEXT:    beq a1, s0, .LBB237_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32I-NEXT:    sltu a0, s0, a5
+; RV32I-NEXT:    sltu a4, s0, a1
 ; RV32I-NEXT:    j .LBB237_5
 ; RV32I-NEXT:  .LBB237_4: # in Loop: Header=BB237_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB237_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    beqz a0, .LBB237_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    beqz a4, .LBB237_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB237_1
 ; RV32I-NEXT:  .LBB237_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28793,45 +28758,44 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB237_2
 ; RV32IA-NEXT:  .LBB237_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 3
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB237_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB237_7
 ; RV32IA-NEXT:  .LBB237_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB237_4
+; RV32IA-NEXT:    beq a1, s0, .LBB237_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s0, a5
+; RV32IA-NEXT:    sltu a4, s0, a1
 ; RV32IA-NEXT:    j .LBB237_5
 ; RV32IA-NEXT:  .LBB237_4: # in Loop: Header=BB237_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB237_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    beqz a0, .LBB237_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    beqz a4, .LBB237_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB237_1
 ; RV32IA-NEXT:  .LBB237_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28845,30 +28809,30 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB237_2
 ; RV64I-NEXT:  .LBB237_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 3
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB237_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB237_4
 ; RV64I-NEXT:  .LBB237_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s1, a3, .LBB237_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s0, a0, .LBB237_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB237_1
 ; RV64I-NEXT:  .LBB237_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -28897,45 +28861,44 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB238_2
 ; RV32I-NEXT:  .LBB238_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 4
 ; RV32I-NEXT:    li a5, 2
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB238_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB238_7
 ; RV32I-NEXT:  .LBB238_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB238_4
+; RV32I-NEXT:    beq a1, s0, .LBB238_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32I-NEXT:    sltu a0, s0, a5
+; RV32I-NEXT:    sltu a4, s0, a1
 ; RV32I-NEXT:    j .LBB238_5
 ; RV32I-NEXT:  .LBB238_4: # in Loop: Header=BB238_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB238_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    beqz a0, .LBB238_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    beqz a4, .LBB238_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB238_1
 ; RV32I-NEXT:  .LBB238_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28951,45 +28914,44 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB238_2
 ; RV32IA-NEXT:  .LBB238_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 4
 ; RV32IA-NEXT:    li a5, 2
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB238_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB238_7
 ; RV32IA-NEXT:  .LBB238_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB238_4
+; RV32IA-NEXT:    beq a1, s0, .LBB238_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s0, a5
+; RV32IA-NEXT:    sltu a4, s0, a1
 ; RV32IA-NEXT:    j .LBB238_5
 ; RV32IA-NEXT:  .LBB238_4: # in Loop: Header=BB238_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB238_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    beqz a0, .LBB238_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    beqz a4, .LBB238_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB238_1
 ; RV32IA-NEXT:  .LBB238_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -29003,30 +28965,30 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB238_2
 ; RV64I-NEXT:  .LBB238_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 2
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB238_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB238_4
 ; RV64I-NEXT:  .LBB238_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s1, a3, .LBB238_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s0, a0, .LBB238_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB238_1
 ; RV64I-NEXT:  .LBB238_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -29055,45 +29017,44 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB239_2
 ; RV32I-NEXT:  .LBB239_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB239_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB239_7
 ; RV32I-NEXT:  .LBB239_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB239_4
+; RV32I-NEXT:    beq a1, s0, .LBB239_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32I-NEXT:    sltu a0, s0, a5
+; RV32I-NEXT:    sltu a4, s0, a1
 ; RV32I-NEXT:    j .LBB239_5
 ; RV32I-NEXT:  .LBB239_4: # in Loop: Header=BB239_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB239_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    beqz a0, .LBB239_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    beqz a4, .LBB239_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB239_1
 ; RV32I-NEXT:  .LBB239_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -29109,45 +29070,44 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB239_2
 ; RV32IA-NEXT:  .LBB239_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB239_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB239_7
 ; RV32IA-NEXT:  .LBB239_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB239_4
+; RV32IA-NEXT:    beq a1, s0, .LBB239_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s0, a5
+; RV32IA-NEXT:    sltu a4, s0, a1
 ; RV32IA-NEXT:    j .LBB239_5
 ; RV32IA-NEXT:  .LBB239_4: # in Loop: Header=BB239_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB239_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    beqz a0, .LBB239_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    beqz a4, .LBB239_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB239_1
 ; RV32IA-NEXT:  .LBB239_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -29161,30 +29121,30 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB239_2
 ; RV64I-NEXT:  .LBB239_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB239_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB239_4
 ; RV64I-NEXT:  .LBB239_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s1, a3, .LBB239_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s0, a0, .LBB239_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB239_1
 ; RV64I-NEXT:  .LBB239_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll
index aea7473ceece4..81c47f8701c50 100644
--- a/llvm/test/CodeGen/RISCV/atomic-signext.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll
@@ -586,34 +586,34 @@ define signext i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 24
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    slli a0, s0, 24
 ; RV32I-NEXT:    srai s2, a0, 24
 ; RV32I-NEXT:    j .LBB10_2
 ; RV32I-NEXT:  .LBB10_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB10_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a1, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
+; RV32I-NEXT:    lbu a1, 15(sp)
 ; RV32I-NEXT:    bnez a0, .LBB10_4
 ; RV32I-NEXT:  .LBB10_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 24
+; RV32I-NEXT:    slli a0, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
-; RV32I-NEXT:    mv a2, a3
+; RV32I-NEXT:    mv a2, a1
 ; RV32I-NEXT:    blt s2, a0, .LBB10_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB10_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB10_1
 ; RV32I-NEXT:  .LBB10_4: # %atomicrmw.end
-; RV32I-NEXT:    slli a0, a3, 24
+; RV32I-NEXT:    slli a0, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -660,34 +660,34 @@ define signext i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 56
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    slli a0, s0, 56
 ; RV64I-NEXT:    srai s2, a0, 56
 ; RV64I-NEXT:    j .LBB10_2
 ; RV64I-NEXT:  .LBB10_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB10_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a1, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
+; RV64I-NEXT:    lbu a1, 15(sp)
 ; RV64I-NEXT:    bnez a0, .LBB10_4
 ; RV64I-NEXT:  .LBB10_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 56
+; RV64I-NEXT:    slli a0, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
-; RV64I-NEXT:    mv a2, a3
+; RV64I-NEXT:    mv a2, a1
 ; RV64I-NEXT:    blt s2, a0, .LBB10_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB10_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB10_1
 ; RV64I-NEXT:  .LBB10_4: # %atomicrmw.end
-; RV64I-NEXT:    slli a0, a3, 56
+; RV64I-NEXT:    slli a0, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
@@ -738,34 +738,34 @@ define signext i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 24
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    slli a0, s0, 24
 ; RV32I-NEXT:    srai s2, a0, 24
 ; RV32I-NEXT:    j .LBB11_2
 ; RV32I-NEXT:  .LBB11_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB11_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a1, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
+; RV32I-NEXT:    lbu a1, 15(sp)
 ; RV32I-NEXT:    bnez a0, .LBB11_4
 ; RV32I-NEXT:  .LBB11_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 24
+; RV32I-NEXT:    slli a0, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
-; RV32I-NEXT:    mv a2, a3
+; RV32I-NEXT:    mv a2, a1
 ; RV32I-NEXT:    bge s2, a0, .LBB11_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB11_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB11_1
 ; RV32I-NEXT:  .LBB11_4: # %atomicrmw.end
-; RV32I-NEXT:    slli a0, a3, 24
+; RV32I-NEXT:    slli a0, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -812,34 +812,34 @@ define signext i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 56
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    slli a0, s0, 56
 ; RV64I-NEXT:    srai s2, a0, 56
 ; RV64I-NEXT:    j .LBB11_2
 ; RV64I-NEXT:  .LBB11_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB11_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a1, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
+; RV64I-NEXT:    lbu a1, 15(sp)
 ; RV64I-NEXT:    bnez a0, .LBB11_4
 ; RV64I-NEXT:  .LBB11_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 56
+; RV64I-NEXT:    slli a0, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
-; RV64I-NEXT:    mv a2, a3
+; RV64I-NEXT:    mv a2, a1
 ; RV64I-NEXT:    bge s2, a0, .LBB11_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB11_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB11_1
 ; RV64I-NEXT:  .LBB11_4: # %atomicrmw.end
-; RV64I-NEXT:    slli a0, a3, 56
+; RV64I-NEXT:    slli a0, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
@@ -890,32 +890,32 @@ define signext i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    andi s2, a1, 255
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    andi s2, s0, 255
 ; RV32I-NEXT:    j .LBB12_2
 ; RV32I-NEXT:  .LBB12_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB12_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a1, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
+; RV32I-NEXT:    lbu a1, 15(sp)
 ; RV32I-NEXT:    bnez a0, .LBB12_4
 ; RV32I-NEXT:  .LBB12_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    andi a0, a3, 255
-; RV32I-NEXT:    mv a2, a3
+; RV32I-NEXT:    andi a0, a1, 255
+; RV32I-NEXT:    mv a2, a1
 ; RV32I-NEXT:    bltu s2, a0, .LBB12_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB12_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB12_1
 ; RV32I-NEXT:  .LBB12_4: # %atomicrmw.end
-; RV32I-NEXT:    slli a0, a3, 24
+; RV32I-NEXT:    slli a0, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -957,32 +957,32 @@ define signext i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    andi s2, a1, 255
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    andi s2, s0, 255
 ; RV64I-NEXT:    j .LBB12_2
 ; RV64I-NEXT:  .LBB12_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB12_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a1, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
+; RV64I-NEXT:    lbu a1, 15(sp)
 ; RV64I-NEXT:    bnez a0, .LBB12_4
 ; RV64I-NEXT:  .LBB12_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    andi a0, a3, 255
-; RV64I-NEXT:    mv a2, a3
+; RV64I-NEXT:    andi a0, a1, 255
+; RV64I-NEXT:    mv a2, a1
 ; RV64I-NEXT:    bltu s2, a0, .LBB12_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB12_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB12_1
 ; RV64I-NEXT:  .LBB12_4: # %atomicrmw.end
-; RV64I-NEXT:    slli a0, a3, 56
+; RV64I-NEXT:    slli a0, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
@@ -1028,32 +1028,32 @@ define signext i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    andi s2, a1, 255
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    andi s2, s0, 255
 ; RV32I-NEXT:    j .LBB13_2
 ; RV32I-NEXT:  .LBB13_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB13_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a1, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
+; RV32I-NEXT:    lbu a1, 15(sp)
 ; RV32I-NEXT:    bnez a0, .LBB13_4
 ; RV32I-NEXT:  .LBB13_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    andi a0, a3, 255
-; RV32I-NEXT:    mv a2, a3
+; RV32I-NEXT:    andi a0, a1, 255
+; RV32I-NEXT:    mv a2, a1
 ; RV32I-NEXT:    bgeu s2, a0, .LBB13_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB13_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB13_1
 ; RV32I-NEXT:  .LBB13_4: # %atomicrmw.end
-; RV32I-NEXT:    slli a0, a3, 24
+; RV32I-NEXT:    slli a0, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -1095,32 +1095,32 @@ define signext i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    andi s2, a1, 255
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    andi s2, s0, 255
 ; RV64I-NEXT:    j .LBB13_2
 ; RV64I-NEXT:  .LBB13_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB13_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a1, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
+; RV64I-NEXT:    lbu a1, 15(sp)
 ; RV64I-NEXT:    bnez a0, .LBB13_4
 ; RV64I-NEXT:  .LBB13_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    andi a0, a3, 255
-; RV64I-NEXT:    mv a2, a3
+; RV64I-NEXT:    andi a0, a1, 255
+; RV64I-NEXT:    mv a2, a1
 ; RV64I-NEXT:    bgeu s2, a0, .LBB13_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB13_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB13_1
 ; RV64I-NEXT:  .LBB13_4: # %atomicrmw.end
-; RV64I-NEXT:    slli a0, a3, 56
+; RV64I-NEXT:    slli a0, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
@@ -1634,34 +1634,34 @@ define signext i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lhu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 16
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lhu a1, 0(a0)
+; RV32I-NEXT:    slli a0, s0, 16
 ; RV32I-NEXT:    srai s2, a0, 16
 ; RV32I-NEXT:    j .LBB21_2
 ; RV32I-NEXT:  .LBB21_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB21_2 Depth=1
-; RV32I-NEXT:    sh a3, 14(sp)
+; RV32I-NEXT:    sh a1, 14(sp)
 ; RV32I-NEXT:    addi a1, sp, 14
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a3, 14(sp)
+; RV32I-NEXT:    lh a1, 14(sp)
 ; RV32I-NEXT:    bnez a0, .LBB21_4
 ; RV32I-NEXT:  .LBB21_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 16
+; RV32I-NEXT:    slli a0, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
-; RV32I-NEXT:    mv a2, a3
+; RV32I-NEXT:    mv a2, a1
 ; RV32I-NEXT:    blt s2, a0, .LBB21_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB21_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB21_1
 ; RV32I-NEXT:  .LBB21_4: # %atomicrmw.end
-; RV32I-NEXT:    slli a0, a3, 16
+; RV32I-NEXT:    slli a0, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -1710,34 +1710,34 @@ define signext i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lhu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 48
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    slli a0, s0, 48
 ; RV64I-NEXT:    srai s2, a0, 48
 ; RV64I-NEXT:    j .LBB21_2
 ; RV64I-NEXT:  .LBB21_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB21_2 Depth=1
-; RV64I-NEXT:    sh a3, 14(sp)
+; RV64I-NEXT:    sh a1, 14(sp)
 ; RV64I-NEXT:    addi a1, sp, 14
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a3, 14(sp)
+; RV64I-NEXT:    lh a1, 14(sp)
 ; RV64I-NEXT:    bnez a0, .LBB21_4
 ; RV64I-NEXT:  .LBB21_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 48
+; RV64I-NEXT:    slli a0, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
-; RV64I-NEXT:    mv a2, a3
+; RV64I-NEXT:    mv a2, a1
 ; RV64I-NEXT:    blt s2, a0, .LBB21_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB21_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB21_1
 ; RV64I-NEXT:  .LBB21_4: # %atomicrmw.end
-; RV64I-NEXT:    slli a0, a3, 48
+; RV64I-NEXT:    slli a0, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
@@ -1790,34 +1790,34 @@ define signext i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lhu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    slli a0, a1, 16
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lhu a1, 0(a0)
+; RV32I-NEXT:    slli a0, s0, 16
 ; RV32I-NEXT:    srai s2, a0, 16
 ; RV32I-NEXT:    j .LBB22_2
 ; RV32I-NEXT:  .LBB22_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB22_2 Depth=1
-; RV32I-NEXT:    sh a3, 14(sp)
+; RV32I-NEXT:    sh a1, 14(sp)
 ; RV32I-NEXT:    addi a1, sp, 14
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a3, 14(sp)
+; RV32I-NEXT:    lh a1, 14(sp)
 ; RV32I-NEXT:    bnez a0, .LBB22_4
 ; RV32I-NEXT:  .LBB22_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    slli a0, a3, 16
+; RV32I-NEXT:    slli a0, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
-; RV32I-NEXT:    mv a2, a3
+; RV32I-NEXT:    mv a2, a1
 ; RV32I-NEXT:    bge s2, a0, .LBB22_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB22_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB22_1
 ; RV32I-NEXT:  .LBB22_4: # %atomicrmw.end
-; RV32I-NEXT:    slli a0, a3, 16
+; RV32I-NEXT:    slli a0, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -1866,34 +1866,34 @@ define signext i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lhu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    slli a0, a1, 48
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    slli a0, s0, 48
 ; RV64I-NEXT:    srai s2, a0, 48
 ; RV64I-NEXT:    j .LBB22_2
 ; RV64I-NEXT:  .LBB22_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB22_2 Depth=1
-; RV64I-NEXT:    sh a3, 14(sp)
+; RV64I-NEXT:    sh a1, 14(sp)
 ; RV64I-NEXT:    addi a1, sp, 14
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a3, 14(sp)
+; RV64I-NEXT:    lh a1, 14(sp)
 ; RV64I-NEXT:    bnez a0, .LBB22_4
 ; RV64I-NEXT:  .LBB22_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    slli a0, a3, 48
+; RV64I-NEXT:    slli a0, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
-; RV64I-NEXT:    mv a2, a3
+; RV64I-NEXT:    mv a2, a1
 ; RV64I-NEXT:    bge s2, a0, .LBB22_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB22_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB22_1
 ; RV64I-NEXT:  .LBB22_4: # %atomicrmw.end
-; RV64I-NEXT:    slli a0, a3, 48
+; RV64I-NEXT:    slli a0, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
@@ -2530,30 +2530,30 @@ define signext i32 @atomicrmw_max_i32_monotonic(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB32_2
 ; RV32I-NEXT:  .LBB32_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB32_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB32_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB32_4
 ; RV32I-NEXT:  .LBB32_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    blt s1, a3, .LBB32_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt s0, a0, .LBB32_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB32_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB32_1
 ; RV32I-NEXT:  .LBB32_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -2572,31 +2572,31 @@ define signext i32 @atomicrmw_max_i32_monotonic(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB32_2
 ; RV64I-NEXT:  .LBB32_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB32_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB32_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB32_4
 ; RV64I-NEXT:  .LBB32_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s2, a3, .LBB32_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s2, a0, .LBB32_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB32_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB32_1
 ; RV64I-NEXT:  .LBB32_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -2619,30 +2619,30 @@ define signext i32 @atomicrmw_min_i32_monotonic(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB33_2
 ; RV32I-NEXT:  .LBB33_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB33_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB33_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB33_4
 ; RV32I-NEXT:  .LBB33_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bge s1, a3, .LBB33_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bge s0, a0, .LBB33_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB33_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB33_1
 ; RV32I-NEXT:  .LBB33_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -2661,31 +2661,31 @@ define signext i32 @atomicrmw_min_i32_monotonic(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB33_2
 ; RV64I-NEXT:  .LBB33_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB33_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB33_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB33_4
 ; RV64I-NEXT:  .LBB33_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s2, a3, .LBB33_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s2, a0, .LBB33_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB33_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB33_1
 ; RV64I-NEXT:  .LBB33_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -2708,30 +2708,30 @@ define signext i32 @atomicrmw_umax_i32_monotonic(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB34_2
 ; RV32I-NEXT:  .LBB34_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB34_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB34_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB34_4
 ; RV32I-NEXT:  .LBB34_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bltu s1, a3, .LBB34_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu s0, a0, .LBB34_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB34_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB34_1
 ; RV32I-NEXT:  .LBB34_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -2750,31 +2750,31 @@ define signext i32 @atomicrmw_umax_i32_monotonic(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB34_2
 ; RV64I-NEXT:  .LBB34_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB34_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB34_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB34_4
 ; RV64I-NEXT:  .LBB34_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s2, a3, .LBB34_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s2, a0, .LBB34_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB34_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB34_1
 ; RV64I-NEXT:  .LBB34_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -2797,30 +2797,30 @@ define signext i32 @atomicrmw_umin_i32_monotonic(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB35_2
 ; RV32I-NEXT:  .LBB35_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB35_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB35_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB35_4
 ; RV32I-NEXT:  .LBB35_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a3
-; RV32I-NEXT:    bgeu s1, a3, .LBB35_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgeu s0, a0, .LBB35_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB35_2 Depth=1
-; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a2, s0
 ; RV32I-NEXT:    j .LBB35_1
 ; RV32I-NEXT:  .LBB35_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -2839,31 +2839,31 @@ define signext i32 @atomicrmw_umin_i32_monotonic(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB35_2
 ; RV64I-NEXT:  .LBB35_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB35_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB35_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB35_4
 ; RV64I-NEXT:  .LBB35_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s2, a3, .LBB35_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s2, a0, .LBB35_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB35_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB35_1
 ; RV64I-NEXT:  .LBB35_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -3183,45 +3183,44 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB43_2
 ; RV32I-NEXT:  .LBB43_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB43_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB43_7
 ; RV32I-NEXT:  .LBB43_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB43_4
+; RV32I-NEXT:    beq a1, s0, .LBB43_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32I-NEXT:    slt a0, s0, a5
+; RV32I-NEXT:    slt a4, s0, a1
 ; RV32I-NEXT:    j .LBB43_5
 ; RV32I-NEXT:  .LBB43_4: # in Loop: Header=BB43_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB43_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    bnez a0, .LBB43_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    bnez a4, .LBB43_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB43_1
 ; RV32I-NEXT:  .LBB43_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3237,45 +3236,44 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB43_2
 ; RV32IA-NEXT:  .LBB43_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB43_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB43_7
 ; RV32IA-NEXT:  .LBB43_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB43_4
+; RV32IA-NEXT:    beq a1, s0, .LBB43_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32IA-NEXT:    slt a0, s0, a5
+; RV32IA-NEXT:    slt a4, s0, a1
 ; RV32IA-NEXT:    j .LBB43_5
 ; RV32IA-NEXT:  .LBB43_4: # in Loop: Header=BB43_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB43_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    bnez a0, .LBB43_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    bnez a4, .LBB43_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB43_1
 ; RV32IA-NEXT:  .LBB43_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3289,30 +3287,30 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB43_2
 ; RV64I-NEXT:  .LBB43_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB43_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB43_4
 ; RV64I-NEXT:  .LBB43_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    blt s1, a3, .LBB43_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt s0, a0, .LBB43_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB43_1
 ; RV64I-NEXT:  .LBB43_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -3336,45 +3334,44 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB44_2
 ; RV32I-NEXT:  .LBB44_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB44_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB44_7
 ; RV32I-NEXT:  .LBB44_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB44_4
+; RV32I-NEXT:    beq a1, s0, .LBB44_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32I-NEXT:    slt a0, s0, a5
+; RV32I-NEXT:    slt a4, s0, a1
 ; RV32I-NEXT:    j .LBB44_5
 ; RV32I-NEXT:  .LBB44_4: # in Loop: Header=BB44_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB44_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    beqz a0, .LBB44_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    beqz a4, .LBB44_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB44_1
 ; RV32I-NEXT:  .LBB44_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3390,45 +3387,44 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB44_2
 ; RV32IA-NEXT:  .LBB44_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB44_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB44_7
 ; RV32IA-NEXT:  .LBB44_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB44_4
+; RV32IA-NEXT:    beq a1, s0, .LBB44_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32IA-NEXT:    slt a0, s0, a5
+; RV32IA-NEXT:    slt a4, s0, a1
 ; RV32IA-NEXT:    j .LBB44_5
 ; RV32IA-NEXT:  .LBB44_4: # in Loop: Header=BB44_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB44_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    beqz a0, .LBB44_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    beqz a4, .LBB44_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB44_1
 ; RV32IA-NEXT:  .LBB44_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3442,30 +3438,30 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB44_2
 ; RV64I-NEXT:  .LBB44_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB44_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB44_4
 ; RV64I-NEXT:  .LBB44_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bge s1, a3, .LBB44_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bge s0, a0, .LBB44_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB44_1
 ; RV64I-NEXT:  .LBB44_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -3489,45 +3485,44 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB45_2
 ; RV32I-NEXT:  .LBB45_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB45_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB45_7
 ; RV32I-NEXT:  .LBB45_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB45_4
+; RV32I-NEXT:    beq a1, s0, .LBB45_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32I-NEXT:    sltu a0, s0, a5
+; RV32I-NEXT:    sltu a4, s0, a1
 ; RV32I-NEXT:    j .LBB45_5
 ; RV32I-NEXT:  .LBB45_4: # in Loop: Header=BB45_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB45_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    bnez a0, .LBB45_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    bnez a4, .LBB45_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB45_1
 ; RV32I-NEXT:  .LBB45_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3543,45 +3538,44 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB45_2
 ; RV32IA-NEXT:  .LBB45_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB45_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB45_7
 ; RV32IA-NEXT:  .LBB45_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB45_4
+; RV32IA-NEXT:    beq a1, s0, .LBB45_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s0, a5
+; RV32IA-NEXT:    sltu a4, s0, a1
 ; RV32IA-NEXT:    j .LBB45_5
 ; RV32IA-NEXT:  .LBB45_4: # in Loop: Header=BB45_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB45_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    bnez a0, .LBB45_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    bnez a4, .LBB45_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB45_1
 ; RV32IA-NEXT:  .LBB45_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3595,30 +3589,30 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB45_2
 ; RV64I-NEXT:  .LBB45_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB45_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB45_4
 ; RV64I-NEXT:  .LBB45_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bltu s1, a3, .LBB45_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu s0, a0, .LBB45_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB45_1
 ; RV64I-NEXT:  .LBB45_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -3642,45 +3636,44 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB46_2
 ; RV32I-NEXT:  .LBB46_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB46_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB46_7
 ; RV32I-NEXT:  .LBB46_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a5, s0, .LBB46_4
+; RV32I-NEXT:    beq a1, s0, .LBB46_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32I-NEXT:    sltu a0, s0, a5
+; RV32I-NEXT:    sltu a4, s0, a1
 ; RV32I-NEXT:    j .LBB46_5
 ; RV32I-NEXT:  .LBB46_4: # in Loop: Header=BB46_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a4
+; RV32I-NEXT:    sltu a4, s1, a0
 ; RV32I-NEXT:  .LBB46_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32I-NEXT:    mv a2, a4
-; RV32I-NEXT:    mv a3, a5
-; RV32I-NEXT:    beqz a0, .LBB46_1
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    beqz a4, .LBB46_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB46_1
 ; RV32I-NEXT:  .LBB46_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3696,45 +3689,44 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB46_2
 ; RV32IA-NEXT:  .LBB46_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB46_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB46_7
 ; RV32IA-NEXT:  .LBB46_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a5, s0, .LBB46_4
+; RV32IA-NEXT:    beq a1, s0, .LBB46_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s0, a5
+; RV32IA-NEXT:    sltu a4, s0, a1
 ; RV32IA-NEXT:    j .LBB46_5
 ; RV32IA-NEXT:  .LBB46_4: # in Loop: Header=BB46_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a4
+; RV32IA-NEXT:    sltu a4, s1, a0
 ; RV32IA-NEXT:  .LBB46_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32IA-NEXT:    mv a2, a4
-; RV32IA-NEXT:    mv a3, a5
-; RV32IA-NEXT:    beqz a0, .LBB46_1
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    mv a3, a1
+; RV32IA-NEXT:    beqz a4, .LBB46_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB46_1
 ; RV32IA-NEXT:  .LBB46_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3748,30 +3740,30 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB46_2
 ; RV64I-NEXT:  .LBB46_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB46_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB46_4
 ; RV64I-NEXT:  .LBB46_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a3
-; RV64I-NEXT:    bgeu s1, a3, .LBB46_1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bgeu s0, a0, .LBB46_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV64I-NEXT:    mv a2, s1
+; RV64I-NEXT:    mv a2, s0
 ; RV64I-NEXT:    j .LBB46_1
 ; RV64I-NEXT:  .LBB46_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -4298,10 +4290,10 @@ define signext i32 @atomicrmw_xchg_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV32IA-NEXT:    amoswap.w a0, a1, (a0)
 ; RV32IA-NEXT:    ret
 ; RV32IA-NEXT:  .LBB53_2: # %else
-; RV32IA-NEXT:    mv a1, a0
-; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 0(a0)
 ; RV32IA-NEXT:    li a2, 1
-; RV32IA-NEXT:    sw a2, 0(a1)
+; RV32IA-NEXT:    sw a2, 0(a0)
+; RV32IA-NEXT:    mv a0, a1
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_xchg_i32_monotonic_crossbb:
@@ -4334,10 +4326,10 @@ define signext i32 @atomicrmw_xchg_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV64IA-NEXT:    amoswap.w a0, a1, (a0)
 ; RV64IA-NEXT:    ret
 ; RV64IA-NEXT:  .LBB53_2: # %else
-; RV64IA-NEXT:    mv a1, a0
-; RV64IA-NEXT:    lw a0, 0(a0)
+; RV64IA-NEXT:    lw a1, 0(a0)
 ; RV64IA-NEXT:    li a2, 1
-; RV64IA-NEXT:    sw a2, 0(a1)
+; RV64IA-NEXT:    sw a2, 0(a0)
+; RV64IA-NEXT:    mv a0, a1
 ; RV64IA-NEXT:    ret
   br i1 %c, label %then, label %else
 
@@ -4385,10 +4377,10 @@ define signext i32 @atomicrmw_add_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV32IA-NEXT:    amoadd.w a0, a1, (a0)
 ; RV32IA-NEXT:    ret
 ; RV32IA-NEXT:  .LBB54_2: # %else
-; RV32IA-NEXT:    mv a1, a0
-; RV32IA-NEXT:    lw a0, 0(a0)
-; RV32IA-NEXT:    addi a2, a0, 1
-; RV32IA-NEXT:    sw a2, 0(a1)
+; RV32IA-NEXT:    lw a1, 0(a0)
+; RV32IA-NEXT:    addi a2, a1, 1
+; RV32IA-NEXT:    sw a2, 0(a0)
+; RV32IA-NEXT:    mv a0, a1
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_add_i32_monotonic_crossbb:
@@ -4421,10 +4413,10 @@ define signext i32 @atomicrmw_add_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV64IA-NEXT:    amoadd.w a0, a1, (a0)
 ; RV64IA-NEXT:    ret
 ; RV64IA-NEXT:  .LBB54_2: # %else
-; RV64IA-NEXT:    mv a1, a0
-; RV64IA-NEXT:    lw a0, 0(a0)
-; RV64IA-NEXT:    addi a2, a0, 1
-; RV64IA-NEXT:    sw a2, 0(a1)
+; RV64IA-NEXT:    lw a1, 0(a0)
+; RV64IA-NEXT:    addi a2, a1, 1
+; RV64IA-NEXT:    sw a2, 0(a0)
+; RV64IA-NEXT:    mv a0, a1
 ; RV64IA-NEXT:    ret
   br i1 %c, label %then, label %else
 
@@ -4473,10 +4465,10 @@ define signext i32 @atomicrmw_sub_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV32IA-NEXT:    amoadd.w a0, a1, (a0)
 ; RV32IA-NEXT:    ret
 ; RV32IA-NEXT:  .LBB55_2: # %else
-; RV32IA-NEXT:    mv a1, a0
-; RV32IA-NEXT:    lw a0, 0(a0)
-; RV32IA-NEXT:    addi a2, a0, -1
-; RV32IA-NEXT:    sw a2, 0(a1)
+; RV32IA-NEXT:    lw a1, 0(a0)
+; RV32IA-NEXT:    addi a2, a1, -1
+; RV32IA-NEXT:    sw a2, 0(a0)
+; RV32IA-NEXT:    mv a0, a1
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_sub_i32_monotonic_crossbb:
@@ -4509,10 +4501,10 @@ define signext i32 @atomicrmw_sub_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV64IA-NEXT:    amoadd.w a0, a1, (a0)
 ; RV64IA-NEXT:    ret
 ; RV64IA-NEXT:  .LBB55_2: # %else
-; RV64IA-NEXT:    mv a1, a0
-; RV64IA-NEXT:    lw a0, 0(a0)
-; RV64IA-NEXT:    addi a2, a0, -1
-; RV64IA-NEXT:    sw a2, 0(a1)
+; RV64IA-NEXT:    lw a1, 0(a0)
+; RV64IA-NEXT:    addi a2, a1, -1
+; RV64IA-NEXT:    sw a2, 0(a0)
+; RV64IA-NEXT:    mv a0, a1
 ; RV64IA-NEXT:    ret
   br i1 %c, label %then, label %else
 
@@ -4561,10 +4553,10 @@ define signext i32 @atomicrmw_and_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV32IA-NEXT:    amoand.w a0, a1, (a0)
 ; RV32IA-NEXT:    ret
 ; RV32IA-NEXT:  .LBB56_2: # %else
-; RV32IA-NEXT:    mv a1, a0
-; RV32IA-NEXT:    lw a0, 0(a0)
-; RV32IA-NEXT:    andi a2, a0, 1
-; RV32IA-NEXT:    sw a2, 0(a1)
+; RV32IA-NEXT:    lw a1, 0(a0)
+; RV32IA-NEXT:    andi a2, a1, 1
+; RV32IA-NEXT:    sw a2, 0(a0)
+; RV32IA-NEXT:    mv a0, a1
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_and_i32_monotonic_crossbb:
@@ -4597,10 +4589,10 @@ define signext i32 @atomicrmw_and_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV64IA-NEXT:    amoand.w a0, a1, (a0)
 ; RV64IA-NEXT:    ret
 ; RV64IA-NEXT:  .LBB56_2: # %else
-; RV64IA-NEXT:    mv a1, a0
-; RV64IA-NEXT:    lw a0, 0(a0)
-; RV64IA-NEXT:    andi a2, a0, 1
-; RV64IA-NEXT:    sw a2, 0(a1)
+; RV64IA-NEXT:    lw a1, 0(a0)
+; RV64IA-NEXT:    andi a2, a1, 1
+; RV64IA-NEXT:    sw a2, 0(a0)
+; RV64IA-NEXT:    mv a0, a1
 ; RV64IA-NEXT:    ret
   br i1 %c, label %then, label %else
 
@@ -4642,24 +4634,25 @@ define signext i32 @atomicrmw_nand_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ;
 ; RV32IA-NOZACAS-LABEL: atomicrmw_nand_i32_monotonic_crossbb:
 ; RV32IA-NOZACAS:       # %bb.0:
-; RV32IA-NOZACAS-NEXT:    andi a2, a1, 1
-; RV32IA-NOZACAS-NEXT:    mv a1, a0
-; RV32IA-NOZACAS-NEXT:    beqz a2, .LBB57_2
+; RV32IA-NOZACAS-NEXT:    andi a1, a1, 1
+; RV32IA-NOZACAS-NEXT:    beqz a1, .LBB57_2
 ; RV32IA-NOZACAS-NEXT:  # %bb.1: # %then
 ; RV32IA-NOZACAS-NEXT:    li a2, 1
 ; RV32IA-NOZACAS-NEXT:  .LBB57_3: # %then
 ; RV32IA-NOZACAS-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NOZACAS-NEXT:    lr.w a0, (a1)
-; RV32IA-NOZACAS-NEXT:    and a3, a0, a2
+; RV32IA-NOZACAS-NEXT:    lr.w a1, (a0)
+; RV32IA-NOZACAS-NEXT:    and a3, a1, a2
 ; RV32IA-NOZACAS-NEXT:    not a3, a3
-; RV32IA-NOZACAS-NEXT:    sc.w a3, a3, (a1)
+; RV32IA-NOZACAS-NEXT:    sc.w a3, a3, (a0)
 ; RV32IA-NOZACAS-NEXT:    bnez a3, .LBB57_3
 ; RV32IA-NOZACAS-NEXT:  # %bb.4: # %then
+; RV32IA-NOZACAS-NEXT:    mv a0, a1
 ; RV32IA-NOZACAS-NEXT:    ret
 ; RV32IA-NOZACAS-NEXT:  .LBB57_2: # %else
-; RV32IA-NOZACAS-NEXT:    lw a0, 0(a1)
-; RV32IA-NOZACAS-NEXT:    andi a2, a0, 1
-; RV32IA-NOZACAS-NEXT:    sw a2, 0(a1)
+; RV32IA-NOZACAS-NEXT:    lw a1, 0(a0)
+; RV32IA-NOZACAS-NEXT:    andi a2, a1, 1
+; RV32IA-NOZACAS-NEXT:    sw a2, 0(a0)
+; RV32IA-NOZACAS-NEXT:    mv a0, a1
 ; RV32IA-NOZACAS-NEXT:    ret
 ;
 ; RV32IA-ZACAS-LABEL: atomicrmw_nand_i32_monotonic_crossbb:
@@ -4708,24 +4701,25 @@ define signext i32 @atomicrmw_nand_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ;
 ; RV64IA-NOZACAS-LABEL: atomicrmw_nand_i32_monotonic_crossbb:
 ; RV64IA-NOZACAS:       # %bb.0:
-; RV64IA-NOZACAS-NEXT:    andi a2, a1, 1
-; RV64IA-NOZACAS-NEXT:    mv a1, a0
-; RV64IA-NOZACAS-NEXT:    beqz a2, .LBB57_2
+; RV64IA-NOZACAS-NEXT:    andi a1, a1, 1
+; RV64IA-NOZACAS-NEXT:    beqz a1, .LBB57_2
 ; RV64IA-NOZACAS-NEXT:  # %bb.1: # %then
 ; RV64IA-NOZACAS-NEXT:    li a2, 1
 ; RV64IA-NOZACAS-NEXT:  .LBB57_3: # %then
 ; RV64IA-NOZACAS-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64IA-NOZACAS-NEXT:    lr.w a0, (a1)
-; RV64IA-NOZACAS-NEXT:    and a3, a0, a2
+; RV64IA-NOZACAS-NEXT:    lr.w a1, (a0)
+; RV64IA-NOZACAS-NEXT:    and a3, a1, a2
 ; RV64IA-NOZACAS-NEXT:    not a3, a3
-; RV64IA-NOZACAS-NEXT:    sc.w a3, a3, (a1)
+; RV64IA-NOZACAS-NEXT:    sc.w a3, a3, (a0)
 ; RV64IA-NOZACAS-NEXT:    bnez a3, .LBB57_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.4: # %then
+; RV64IA-NOZACAS-NEXT:    mv a0, a1
 ; RV64IA-NOZACAS-NEXT:    ret
 ; RV64IA-NOZACAS-NEXT:  .LBB57_2: # %else
-; RV64IA-NOZACAS-NEXT:    lw a0, 0(a1)
-; RV64IA-NOZACAS-NEXT:    andi a2, a0, 1
-; RV64IA-NOZACAS-NEXT:    sw a2, 0(a1)
+; RV64IA-NOZACAS-NEXT:    lw a1, 0(a0)
+; RV64IA-NOZACAS-NEXT:    andi a2, a1, 1
+; RV64IA-NOZACAS-NEXT:    sw a2, 0(a0)
+; RV64IA-NOZACAS-NEXT:    mv a0, a1
 ; RV64IA-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-ZACAS-LABEL: atomicrmw_nand_i32_monotonic_crossbb:
@@ -4797,10 +4791,10 @@ define signext i32 @atomicrmw_or_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind {
 ; RV32IA-NEXT:    amoor.w a0, a1, (a0)
 ; RV32IA-NEXT:    ret
 ; RV32IA-NEXT:  .LBB58_2: # %else
-; RV32IA-NEXT:    mv a1, a0
-; RV32IA-NEXT:    lw a0, 0(a0)
-; RV32IA-NEXT:    ori a2, a0, 1
-; RV32IA-NEXT:    sw a2, 0(a1)
+; RV32IA-NEXT:    lw a1, 0(a0)
+; RV32IA-NEXT:    ori a2, a1, 1
+; RV32IA-NEXT:    sw a2, 0(a0)
+; RV32IA-NEXT:    mv a0, a1
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_or_i32_monotonic_crossbb:
@@ -4833,10 +4827,10 @@ define signext i32 @atomicrmw_or_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind {
 ; RV64IA-NEXT:    amoor.w a0, a1, (a0)
 ; RV64IA-NEXT:    ret
 ; RV64IA-NEXT:  .LBB58_2: # %else
-; RV64IA-NEXT:    mv a1, a0
-; RV64IA-NEXT:    lw a0, 0(a0)
-; RV64IA-NEXT:    ori a2, a0, 1
-; RV64IA-NEXT:    sw a2, 0(a1)
+; RV64IA-NEXT:    lw a1, 0(a0)
+; RV64IA-NEXT:    ori a2, a1, 1
+; RV64IA-NEXT:    sw a2, 0(a0)
+; RV64IA-NEXT:    mv a0, a1
 ; RV64IA-NEXT:    ret
   br i1 %c, label %then, label %else
 
@@ -4885,10 +4879,10 @@ define signext i32 @atomicrmw_xor_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV32IA-NEXT:    amoxor.w a0, a1, (a0)
 ; RV32IA-NEXT:    ret
 ; RV32IA-NEXT:  .LBB59_2: # %else
-; RV32IA-NEXT:    mv a1, a0
-; RV32IA-NEXT:    lw a0, 0(a0)
-; RV32IA-NEXT:    xori a2, a0, 1
-; RV32IA-NEXT:    sw a2, 0(a1)
+; RV32IA-NEXT:    lw a1, 0(a0)
+; RV32IA-NEXT:    xori a2, a1, 1
+; RV32IA-NEXT:    sw a2, 0(a0)
+; RV32IA-NEXT:    mv a0, a1
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_xor_i32_monotonic_crossbb:
@@ -4921,10 +4915,10 @@ define signext i32 @atomicrmw_xor_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV64IA-NEXT:    amoxor.w a0, a1, (a0)
 ; RV64IA-NEXT:    ret
 ; RV64IA-NEXT:  .LBB59_2: # %else
-; RV64IA-NEXT:    mv a1, a0
-; RV64IA-NEXT:    lw a0, 0(a0)
-; RV64IA-NEXT:    xori a2, a0, 1
-; RV64IA-NEXT:    sw a2, 0(a1)
+; RV64IA-NEXT:    lw a1, 0(a0)
+; RV64IA-NEXT:    xori a2, a1, 1
+; RV64IA-NEXT:    sw a2, 0(a0)
+; RV64IA-NEXT:    mv a0, a1
 ; RV64IA-NEXT:    ret
   br i1 %c, label %then, label %else
 
@@ -4949,40 +4943,40 @@ define signext i32 @atomicrmw_max_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    beqz a1, .LBB60_5
 ; RV32I-NEXT:  # %bb.1: # %then
-; RV32I-NEXT:    lw a1, 0(s0)
+; RV32I-NEXT:    lw a0, 0(s0)
 ; RV32I-NEXT:    j .LBB60_3
 ; RV32I-NEXT:  .LBB60_2: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB60_3 Depth=1
-; RV32I-NEXT:    sw a1, 4(sp)
+; RV32I-NEXT:    sw a0, 4(sp)
 ; RV32I-NEXT:    addi a1, sp, 4
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a1, 4(sp)
-; RV32I-NEXT:    bnez a0, .LBB60_8
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 4(sp)
+; RV32I-NEXT:    bnez a1, .LBB60_8
 ; RV32I-NEXT:  .LBB60_3: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a1
-; RV32I-NEXT:    bgtz a1, .LBB60_2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bgtz a0, .LBB60_2
 ; RV32I-NEXT:  # %bb.4: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB60_3 Depth=1
 ; RV32I-NEXT:    li a2, 1
 ; RV32I-NEXT:    j .LBB60_2
 ; RV32I-NEXT:  .LBB60_5: # %else
-; RV32I-NEXT:    lw a1, 0(s0)
-; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    bgtz a1, .LBB60_7
+; RV32I-NEXT:    lw a0, 0(s0)
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    bgtz a0, .LBB60_7
 ; RV32I-NEXT:  # %bb.6: # %else
-; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    li a1, 1
 ; RV32I-NEXT:  .LBB60_7: # %else
-; RV32I-NEXT:    sw a0, 0(s0)
+; RV32I-NEXT:    sw a1, 0(s0)
 ; RV32I-NEXT:  .LBB60_8: # %merge
-; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
@@ -4990,21 +4984,21 @@ define signext i32 @atomicrmw_max_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ;
 ; RV32IA-LABEL: atomicrmw_max_i32_monotonic_crossbb:
 ; RV32IA:       # %bb.0:
-; RV32IA-NEXT:    andi a2, a1, 1
-; RV32IA-NEXT:    mv a1, a0
-; RV32IA-NEXT:    beqz a2, .LBB60_2
+; RV32IA-NEXT:    andi a1, a1, 1
+; RV32IA-NEXT:    beqz a1, .LBB60_2
 ; RV32IA-NEXT:  # %bb.1: # %then
-; RV32IA-NEXT:    li a0, 1
-; RV32IA-NEXT:    amomax.w a0, a0, (a1)
+; RV32IA-NEXT:    li a1, 1
+; RV32IA-NEXT:    amomax.w a0, a1, (a0)
 ; RV32IA-NEXT:    ret
 ; RV32IA-NEXT:  .LBB60_2: # %else
-; RV32IA-NEXT:    lw a0, 0(a1)
-; RV32IA-NEXT:    mv a2, a0
-; RV32IA-NEXT:    bgtz a0, .LBB60_4
+; RV32IA-NEXT:    lw a1, 0(a0)
+; RV32IA-NEXT:    mv a2, a1
+; RV32IA-NEXT:    bgtz a1, .LBB60_4
 ; RV32IA-NEXT:  # %bb.3: # %else
 ; RV32IA-NEXT:    li a2, 1
 ; RV32IA-NEXT:  .LBB60_4: # %else
-; RV32IA-NEXT:    sw a2, 0(a1)
+; RV32IA-NEXT:    sw a2, 0(a0)
+; RV32IA-NEXT:    mv a0, a1
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_max_i32_monotonic_crossbb:
@@ -5012,41 +5006,41 @@ define signext i32 @atomicrmw_max_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV64I-NEXT:    addi sp, sp, -32
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    beqz a1, .LBB60_5
 ; RV64I-NEXT:  # %bb.1: # %then
-; RV64I-NEXT:    lw a1, 0(s0)
+; RV64I-NEXT:    lw a0, 0(s0)
 ; RV64I-NEXT:    j .LBB60_3
 ; RV64I-NEXT:  .LBB60_2: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB60_3 Depth=1
-; RV64I-NEXT:    sw a1, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a1, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB60_8
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB60_8
 ; RV64I-NEXT:  .LBB60_3: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    mv a2, a1
-; RV64I-NEXT:    blt a0, a1, .LBB60_2
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt a1, a0, .LBB60_2
 ; RV64I-NEXT:  # %bb.4: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB60_3 Depth=1
 ; RV64I-NEXT:    li a2, 1
 ; RV64I-NEXT:    j .LBB60_2
 ; RV64I-NEXT:  .LBB60_5: # %else
-; RV64I-NEXT:    lw a1, 0(s0)
-; RV64I-NEXT:    mv a0, a1
-; RV64I-NEXT:    bgtz a1, .LBB60_7
+; RV64I-NEXT:    lw a0, 0(s0)
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    bgtz a0, .LBB60_7
 ; RV64I-NEXT:  # %bb.6: # %else
-; RV64I-NEXT:    li a0, 1
+; RV64I-NEXT:    li a1, 1
 ; RV64I-NEXT:  .LBB60_7: # %else
-; RV64I-NEXT:    sw a0, 0(s0)
+; RV64I-NEXT:    sw a1, 0(s0)
 ; RV64I-NEXT:  .LBB60_8: # %merge
-; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 32
@@ -5054,21 +5048,21 @@ define signext i32 @atomicrmw_max_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ;
 ; RV64IA-LABEL: atomicrmw_max_i32_monotonic_crossbb:
 ; RV64IA:       # %bb.0:
-; RV64IA-NEXT:    andi a2, a1, 1
-; RV64IA-NEXT:    mv a1, a0
-; RV64IA-NEXT:    beqz a2, .LBB60_2
+; RV64IA-NEXT:    andi a1, a1, 1
+; RV64IA-NEXT:    beqz a1, .LBB60_2
 ; RV64IA-NEXT:  # %bb.1: # %then
-; RV64IA-NEXT:    li a0, 1
-; RV64IA-NEXT:    amomax.w a0, a0, (a1)
+; RV64IA-NEXT:    li a1, 1
+; RV64IA-NEXT:    amomax.w a0, a1, (a0)
 ; RV64IA-NEXT:    ret
 ; RV64IA-NEXT:  .LBB60_2: # %else
-; RV64IA-NEXT:    lw a0, 0(a1)
-; RV64IA-NEXT:    mv a2, a0
-; RV64IA-NEXT:    bgtz a0, .LBB60_4
+; RV64IA-NEXT:    lw a1, 0(a0)
+; RV64IA-NEXT:    mv a2, a1
+; RV64IA-NEXT:    bgtz a1, .LBB60_4
 ; RV64IA-NEXT:  # %bb.3: # %else
 ; RV64IA-NEXT:    li a2, 1
 ; RV64IA-NEXT:  .LBB60_4: # %else
-; RV64IA-NEXT:    sw a2, 0(a1)
+; RV64IA-NEXT:    sw a2, 0(a0)
+; RV64IA-NEXT:    mv a0, a1
 ; RV64IA-NEXT:    ret
   br i1 %c, label %then, label %else
 
@@ -5095,41 +5089,41 @@ define signext i32 @atomicrmw_min_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    beqz a1, .LBB61_5
 ; RV32I-NEXT:  # %bb.1: # %then
-; RV32I-NEXT:    lw a1, 0(s0)
+; RV32I-NEXT:    lw a0, 0(s0)
 ; RV32I-NEXT:    li s1, 2
 ; RV32I-NEXT:    j .LBB61_3
 ; RV32I-NEXT:  .LBB61_2: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB61_3 Depth=1
-; RV32I-NEXT:    sw a1, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a1, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB61_8
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB61_8
 ; RV32I-NEXT:  .LBB61_3: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a1
-; RV32I-NEXT:    blt a1, s1, .LBB61_2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    blt a0, s1, .LBB61_2
 ; RV32I-NEXT:  # %bb.4: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB61_3 Depth=1
 ; RV32I-NEXT:    li a2, 1
 ; RV32I-NEXT:    j .LBB61_2
 ; RV32I-NEXT:  .LBB61_5: # %else
-; RV32I-NEXT:    lw a1, 0(s0)
-; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    blez a1, .LBB61_7
+; RV32I-NEXT:    lw a0, 0(s0)
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    blez a0, .LBB61_7
 ; RV32I-NEXT:  # %bb.6: # %else
-; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    li a1, 1
 ; RV32I-NEXT:  .LBB61_7: # %else
-; RV32I-NEXT:    sw a0, 0(s0)
+; RV32I-NEXT:    sw a1, 0(s0)
 ; RV32I-NEXT:  .LBB61_8: # %merge
-; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -5138,21 +5132,21 @@ define signext i32 @atomicrmw_min_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ;
 ; RV32IA-LABEL: atomicrmw_min_i32_monotonic_crossbb:
 ; RV32IA:       # %bb.0:
-; RV32IA-NEXT:    andi a2, a1, 1
-; RV32IA-NEXT:    mv a1, a0
-; RV32IA-NEXT:    beqz a2, .LBB61_2
+; RV32IA-NEXT:    andi a1, a1, 1
+; RV32IA-NEXT:    beqz a1, .LBB61_2
 ; RV32IA-NEXT:  # %bb.1: # %then
-; RV32IA-NEXT:    li a0, 1
-; RV32IA-NEXT:    amomin.w a0, a0, (a1)
+; RV32IA-NEXT:    li a1, 1
+; RV32IA-NEXT:    amomin.w a0, a1, (a0)
 ; RV32IA-NEXT:    ret
 ; RV32IA-NEXT:  .LBB61_2: # %else
-; RV32IA-NEXT:    lw a0, 0(a1)
-; RV32IA-NEXT:    mv a2, a0
-; RV32IA-NEXT:    blez a0, .LBB61_4
+; RV32IA-NEXT:    lw a1, 0(a0)
+; RV32IA-NEXT:    mv a2, a1
+; RV32IA-NEXT:    blez a1, .LBB61_4
 ; RV32IA-NEXT:  # %bb.3: # %else
 ; RV32IA-NEXT:    li a2, 1
 ; RV32IA-NEXT:  .LBB61_4: # %else
-; RV32IA-NEXT:    sw a2, 0(a1)
+; RV32IA-NEXT:    sw a2, 0(a0)
+; RV32IA-NEXT:    mv a0, a1
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_min_i32_monotonic_crossbb:
@@ -5161,41 +5155,41 @@ define signext i32 @atomicrmw_min_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    beqz a1, .LBB61_5
 ; RV64I-NEXT:  # %bb.1: # %then
-; RV64I-NEXT:    lw a1, 0(s0)
+; RV64I-NEXT:    lw a0, 0(s0)
 ; RV64I-NEXT:    li s1, 2
 ; RV64I-NEXT:    j .LBB61_3
 ; RV64I-NEXT:  .LBB61_2: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB61_3 Depth=1
-; RV64I-NEXT:    sw a1, 4(sp)
+; RV64I-NEXT:    sw a0, 4(sp)
 ; RV64I-NEXT:    addi a1, sp, 4
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a1, 4(sp)
-; RV64I-NEXT:    bnez a0, .LBB61_8
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 4(sp)
+; RV64I-NEXT:    bnez a1, .LBB61_8
 ; RV64I-NEXT:  .LBB61_3: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a1
-; RV64I-NEXT:    blt a1, s1, .LBB61_2
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    blt a0, s1, .LBB61_2
 ; RV64I-NEXT:  # %bb.4: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB61_3 Depth=1
 ; RV64I-NEXT:    li a2, 1
 ; RV64I-NEXT:    j .LBB61_2
 ; RV64I-NEXT:  .LBB61_5: # %else
-; RV64I-NEXT:    lw a1, 0(s0)
-; RV64I-NEXT:    mv a0, a1
-; RV64I-NEXT:    blez a1, .LBB61_7
+; RV64I-NEXT:    lw a0, 0(s0)
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    blez a0, .LBB61_7
 ; RV64I-NEXT:  # %bb.6: # %else
-; RV64I-NEXT:    li a0, 1
+; RV64I-NEXT:    li a1, 1
 ; RV64I-NEXT:  .LBB61_7: # %else
-; RV64I-NEXT:    sw a0, 0(s0)
+; RV64I-NEXT:    sw a1, 0(s0)
 ; RV64I-NEXT:  .LBB61_8: # %merge
-; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -5204,21 +5198,21 @@ define signext i32 @atomicrmw_min_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ;
 ; RV64IA-LABEL: atomicrmw_min_i32_monotonic_crossbb:
 ; RV64IA:       # %bb.0:
-; RV64IA-NEXT:    andi a2, a1, 1
-; RV64IA-NEXT:    mv a1, a0
-; RV64IA-NEXT:    beqz a2, .LBB61_2
+; RV64IA-NEXT:    andi a1, a1, 1
+; RV64IA-NEXT:    beqz a1, .LBB61_2
 ; RV64IA-NEXT:  # %bb.1: # %then
-; RV64IA-NEXT:    li a0, 1
-; RV64IA-NEXT:    amomin.w a0, a0, (a1)
+; RV64IA-NEXT:    li a1, 1
+; RV64IA-NEXT:    amomin.w a0, a1, (a0)
 ; RV64IA-NEXT:    ret
 ; RV64IA-NEXT:  .LBB61_2: # %else
-; RV64IA-NEXT:    lw a0, 0(a1)
-; RV64IA-NEXT:    mv a2, a0
-; RV64IA-NEXT:    blez a0, .LBB61_4
+; RV64IA-NEXT:    lw a1, 0(a0)
+; RV64IA-NEXT:    mv a2, a1
+; RV64IA-NEXT:    blez a1, .LBB61_4
 ; RV64IA-NEXT:  # %bb.3: # %else
 ; RV64IA-NEXT:    li a2, 1
 ; RV64IA-NEXT:  .LBB61_4: # %else
-; RV64IA-NEXT:    sw a2, 0(a1)
+; RV64IA-NEXT:    sw a2, 0(a0)
+; RV64IA-NEXT:    mv a0, a1
 ; RV64IA-NEXT:    ret
   br i1 %c, label %then, label %else
 
@@ -5244,31 +5238,31 @@ define signext i32 @atomicrmw_umax_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    beqz a1, .LBB62_3
 ; RV32I-NEXT:  # %bb.1: # %then
-; RV32I-NEXT:    lw a1, 0(s0)
+; RV32I-NEXT:    lw a0, 0(s0)
 ; RV32I-NEXT:  .LBB62_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    seqz a2, a1
-; RV32I-NEXT:    add a2, a1, a2
-; RV32I-NEXT:    sw a1, 4(sp)
+; RV32I-NEXT:    seqz a2, a0
+; RV32I-NEXT:    add a2, a0, a2
+; RV32I-NEXT:    sw a0, 4(sp)
 ; RV32I-NEXT:    addi a1, sp, 4
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a1, 4(sp)
-; RV32I-NEXT:    beqz a0, .LBB62_2
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 4(sp)
+; RV32I-NEXT:    beqz a1, .LBB62_2
 ; RV32I-NEXT:    j .LBB62_4
 ; RV32I-NEXT:  .LBB62_3: # %else
-; RV32I-NEXT:    lw a1, 0(s0)
-; RV32I-NEXT:    seqz a0, a1
-; RV32I-NEXT:    add a0, a1, a0
-; RV32I-NEXT:    sw a0, 0(s0)
+; RV32I-NEXT:    lw a0, 0(s0)
+; RV32I-NEXT:    seqz a1, a0
+; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    sw a1, 0(s0)
 ; RV32I-NEXT:  .LBB62_4: # %merge
-; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
@@ -5283,11 +5277,11 @@ define signext i32 @atomicrmw_umax_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV32IA-NEXT:    amomaxu.w a0, a1, (a0)
 ; RV32IA-NEXT:    ret
 ; RV32IA-NEXT:  .LBB62_2: # %else
-; RV32IA-NEXT:    mv a1, a0
-; RV32IA-NEXT:    lw a0, 0(a0)
-; RV32IA-NEXT:    seqz a2, a0
-; RV32IA-NEXT:    add a2, a0, a2
-; RV32IA-NEXT:    sw a2, 0(a1)
+; RV32IA-NEXT:    lw a1, 0(a0)
+; RV32IA-NEXT:    seqz a2, a1
+; RV32IA-NEXT:    add a2, a1, a2
+; RV32IA-NEXT:    sw a2, 0(a0)
+; RV32IA-NEXT:    mv a0, a1
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_umax_i32_monotonic_crossbb:
@@ -5295,38 +5289,38 @@ define signext i32 @atomicrmw_umax_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV64I-NEXT:    addi sp, sp, -32
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    beqz a1, .LBB62_5
 ; RV64I-NEXT:  # %bb.1: # %then
-; RV64I-NEXT:    lw a1, 0(s0)
+; RV64I-NEXT:    lw a0, 0(s0)
 ; RV64I-NEXT:    j .LBB62_3
 ; RV64I-NEXT:  .LBB62_2: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB62_3 Depth=1
-; RV64I-NEXT:    sw a1, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a1, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB62_6
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB62_6
 ; RV64I-NEXT:  .LBB62_3: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    mv a2, a1
-; RV64I-NEXT:    bltu a0, a1, .LBB62_2
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu a1, a0, .LBB62_2
 ; RV64I-NEXT:  # %bb.4: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB62_3 Depth=1
 ; RV64I-NEXT:    li a2, 1
 ; RV64I-NEXT:    j .LBB62_2
 ; RV64I-NEXT:  .LBB62_5: # %else
-; RV64I-NEXT:    lw a1, 0(s0)
-; RV64I-NEXT:    seqz a0, a1
-; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    sw a0, 0(s0)
+; RV64I-NEXT:    lw a0, 0(s0)
+; RV64I-NEXT:    seqz a1, a0
+; RV64I-NEXT:    add a1, a0, a1
+; RV64I-NEXT:    sw a1, 0(s0)
 ; RV64I-NEXT:  .LBB62_6: # %merge
-; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 32
@@ -5341,11 +5335,11 @@ define signext i32 @atomicrmw_umax_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV64IA-NEXT:    amomaxu.w a0, a1, (a0)
 ; RV64IA-NEXT:    ret
 ; RV64IA-NEXT:  .LBB62_2: # %else
-; RV64IA-NEXT:    mv a1, a0
-; RV64IA-NEXT:    lw a0, 0(a0)
-; RV64IA-NEXT:    seqz a2, a0
-; RV64IA-NEXT:    add a2, a0, a2
-; RV64IA-NEXT:    sw a2, 0(a1)
+; RV64IA-NEXT:    lw a1, 0(a0)
+; RV64IA-NEXT:    seqz a2, a1
+; RV64IA-NEXT:    add a2, a1, a2
+; RV64IA-NEXT:    sw a2, 0(a0)
+; RV64IA-NEXT:    mv a0, a1
 ; RV64IA-NEXT:    ret
   br i1 %c, label %then, label %else
 
@@ -5372,42 +5366,42 @@ define signext i32 @atomicrmw_umin_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    andi a1, a1, 1
 ; RV32I-NEXT:    beqz a1, .LBB63_5
 ; RV32I-NEXT:  # %bb.1: # %then
-; RV32I-NEXT:    lw a1, 0(s0)
+; RV32I-NEXT:    lw a0, 0(s0)
 ; RV32I-NEXT:    li s1, 2
 ; RV32I-NEXT:    j .LBB63_3
 ; RV32I-NEXT:  .LBB63_2: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB63_3 Depth=1
-; RV32I-NEXT:    sw a1, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a1, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB63_8
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB63_8
 ; RV32I-NEXT:  .LBB63_3: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    mv a2, a1
-; RV32I-NEXT:    bltu a1, s1, .LBB63_2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    bltu a0, s1, .LBB63_2
 ; RV32I-NEXT:  # %bb.4: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB63_3 Depth=1
 ; RV32I-NEXT:    li a2, 1
 ; RV32I-NEXT:    j .LBB63_2
 ; RV32I-NEXT:  .LBB63_5: # %else
-; RV32I-NEXT:    lw a1, 0(s0)
+; RV32I-NEXT:    lw a0, 0(s0)
 ; RV32I-NEXT:    li a2, 1
-; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    bltu a1, a2, .LBB63_7
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    bltu a0, a2, .LBB63_7
 ; RV32I-NEXT:  # %bb.6: # %else
-; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    li a1, 1
 ; RV32I-NEXT:  .LBB63_7: # %else
-; RV32I-NEXT:    sw a0, 0(s0)
+; RV32I-NEXT:    sw a1, 0(s0)
 ; RV32I-NEXT:  .LBB63_8: # %merge
-; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -5416,22 +5410,22 @@ define signext i32 @atomicrmw_umin_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ;
 ; RV32IA-LABEL: atomicrmw_umin_i32_monotonic_crossbb:
 ; RV32IA:       # %bb.0:
-; RV32IA-NEXT:    andi a2, a1, 1
-; RV32IA-NEXT:    mv a1, a0
-; RV32IA-NEXT:    beqz a2, .LBB63_2
+; RV32IA-NEXT:    andi a1, a1, 1
+; RV32IA-NEXT:    beqz a1, .LBB63_2
 ; RV32IA-NEXT:  # %bb.1: # %then
-; RV32IA-NEXT:    li a0, 1
-; RV32IA-NEXT:    amominu.w a0, a0, (a1)
+; RV32IA-NEXT:    li a1, 1
+; RV32IA-NEXT:    amominu.w a0, a1, (a0)
 ; RV32IA-NEXT:    ret
 ; RV32IA-NEXT:  .LBB63_2: # %else
-; RV32IA-NEXT:    lw a0, 0(a1)
+; RV32IA-NEXT:    lw a1, 0(a0)
 ; RV32IA-NEXT:    li a3, 1
-; RV32IA-NEXT:    mv a2, a0
-; RV32IA-NEXT:    bltu a0, a3, .LBB63_4
+; RV32IA-NEXT:    mv a2, a1
+; RV32IA-NEXT:    bltu a1, a3, .LBB63_4
 ; RV32IA-NEXT:  # %bb.3: # %else
 ; RV32IA-NEXT:    li a2, 1
 ; RV32IA-NEXT:  .LBB63_4: # %else
-; RV32IA-NEXT:    sw a2, 0(a1)
+; RV32IA-NEXT:    sw a2, 0(a0)
+; RV32IA-NEXT:    mv a0, a1
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_umin_i32_monotonic_crossbb:
@@ -5440,42 +5434,42 @@ define signext i32 @atomicrmw_umin_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    andi a1, a1, 1
 ; RV64I-NEXT:    beqz a1, .LBB63_5
 ; RV64I-NEXT:  # %bb.1: # %then
-; RV64I-NEXT:    lw a1, 0(s0)
+; RV64I-NEXT:    lw a0, 0(s0)
 ; RV64I-NEXT:    li s1, 2
 ; RV64I-NEXT:    j .LBB63_3
 ; RV64I-NEXT:  .LBB63_2: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB63_3 Depth=1
-; RV64I-NEXT:    sw a1, 4(sp)
+; RV64I-NEXT:    sw a0, 4(sp)
 ; RV64I-NEXT:    addi a1, sp, 4
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    li a3, 0
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a1, 4(sp)
-; RV64I-NEXT:    bnez a0, .LBB63_8
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 4(sp)
+; RV64I-NEXT:    bnez a1, .LBB63_8
 ; RV64I-NEXT:  .LBB63_3: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    mv a2, a1
-; RV64I-NEXT:    bltu a1, s1, .LBB63_2
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    bltu a0, s1, .LBB63_2
 ; RV64I-NEXT:  # %bb.4: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB63_3 Depth=1
 ; RV64I-NEXT:    li a2, 1
 ; RV64I-NEXT:    j .LBB63_2
 ; RV64I-NEXT:  .LBB63_5: # %else
-; RV64I-NEXT:    lw a1, 0(s0)
+; RV64I-NEXT:    lw a0, 0(s0)
 ; RV64I-NEXT:    li a2, 1
-; RV64I-NEXT:    mv a0, a1
-; RV64I-NEXT:    bltu a1, a2, .LBB63_7
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    bltu a0, a2, .LBB63_7
 ; RV64I-NEXT:  # %bb.6: # %else
-; RV64I-NEXT:    li a0, 1
+; RV64I-NEXT:    li a1, 1
 ; RV64I-NEXT:  .LBB63_7: # %else
-; RV64I-NEXT:    sw a0, 0(s0)
+; RV64I-NEXT:    sw a1, 0(s0)
 ; RV64I-NEXT:  .LBB63_8: # %merge
-; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -5484,22 +5478,22 @@ define signext i32 @atomicrmw_umin_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
 ;
 ; RV64IA-LABEL: atomicrmw_umin_i32_monotonic_crossbb:
 ; RV64IA:       # %bb.0:
-; RV64IA-NEXT:    andi a2, a1, 1
-; RV64IA-NEXT:    mv a1, a0
-; RV64IA-NEXT:    beqz a2, .LBB63_2
+; RV64IA-NEXT:    andi a1, a1, 1
+; RV64IA-NEXT:    beqz a1, .LBB63_2
 ; RV64IA-NEXT:  # %bb.1: # %then
-; RV64IA-NEXT:    li a0, 1
-; RV64IA-NEXT:    amominu.w a0, a0, (a1)
+; RV64IA-NEXT:    li a1, 1
+; RV64IA-NEXT:    amominu.w a0, a1, (a0)
 ; RV64IA-NEXT:    ret
 ; RV64IA-NEXT:  .LBB63_2: # %else
-; RV64IA-NEXT:    lw a0, 0(a1)
+; RV64IA-NEXT:    lw a1, 0(a0)
 ; RV64IA-NEXT:    li a3, 1
-; RV64IA-NEXT:    mv a2, a0
-; RV64IA-NEXT:    bltu a0, a3, .LBB63_4
+; RV64IA-NEXT:    mv a2, a1
+; RV64IA-NEXT:    bltu a1, a3, .LBB63_4
 ; RV64IA-NEXT:  # %bb.3: # %else
 ; RV64IA-NEXT:    li a2, 1
 ; RV64IA-NEXT:  .LBB63_4: # %else
-; RV64IA-NEXT:    sw a2, 0(a1)
+; RV64IA-NEXT:    sw a2, 0(a0)
+; RV64IA-NEXT:    mv a0, a1
 ; RV64IA-NEXT:    ret
   br i1 %c, label %then, label %else
 
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
index 34b29ea1dc6c2..82e64c9cb5f65 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
@@ -26,27 +26,27 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
 ; RV32I-NEXT:    .cfi_offset s0, -8
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    andi s2, a1, 255
 ; RV32I-NEXT:  .LBB0_1: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    andi a0, a3, 255
-; RV32I-NEXT:    sltu a0, a0, s2
-; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, s1
-; RV32I-NEXT:    sub a2, a3, a0
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    sltu a1, a1, s2
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    beqz a0, .LBB0_1
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    beqz a1, .LBB0_1
 ; RV32I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -65,9 +65,9 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
 ; RV32IA-NEXT:    slli a3, a0, 3
 ; RV32IA-NEXT:    li a4, 255
 ; RV32IA-NEXT:    andi a0, a3, 24
-; RV32IA-NEXT:    lw a5, 0(a2)
 ; RV32IA-NEXT:    sll a3, a4, a3
 ; RV32IA-NEXT:    not a3, a3
+; RV32IA-NEXT:    lw a5, 0(a2)
 ; RV32IA-NEXT:    andi a4, a1, 255
 ; RV32IA-NEXT:  .LBB0_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Loop Header: Depth=1
@@ -109,27 +109,27 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
 ; RV64I-NEXT:    .cfi_offset s0, -16
 ; RV64I-NEXT:    .cfi_offset s1, -24
 ; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    andi s2, a1, 255
 ; RV64I-NEXT:  .LBB0_1: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    andi a0, a3, 255
-; RV64I-NEXT:    sltu a0, a0, s2
-; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, s1
-; RV64I-NEXT:    sub a2, a3, a0
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    sltu a1, a1, s2
+; RV64I-NEXT:    addi a1, a1, -1
+; RV64I-NEXT:    and a1, a1, s0
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    beqz a0, .LBB0_1
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    beqz a1, .LBB0_1
 ; RV64I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -145,18 +145,18 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
 ; RV64IA-LABEL: atomicrmw_usub_cond_i8:
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
-; RV64IA-NEXT:    slli a4, a0, 3
-; RV64IA-NEXT:    li a5, 255
-; RV64IA-NEXT:    andi a0, a4, 24
-; RV64IA-NEXT:    lw a3, 0(a2)
-; RV64IA-NEXT:    sllw a4, a5, a4
-; RV64IA-NEXT:    not a4, a4
+; RV64IA-NEXT:    slli a3, a0, 3
+; RV64IA-NEXT:    li a4, 255
+; RV64IA-NEXT:    andi a0, a3, 24
+; RV64IA-NEXT:    sllw a3, a4, a3
+; RV64IA-NEXT:    not a3, a3
+; RV64IA-NEXT:    lw a4, 0(a2)
 ; RV64IA-NEXT:    andi a5, a1, 255
 ; RV64IA-NEXT:  .LBB0_1: # %atomicrmw.start
 ; RV64IA-NEXT:    # =>This Loop Header: Depth=1
 ; RV64IA-NEXT:    # Child Loop BB0_3 Depth 2
-; RV64IA-NEXT:    srlw a6, a3, a0
-; RV64IA-NEXT:    sext.w a7, a3
+; RV64IA-NEXT:    srlw a6, a4, a0
+; RV64IA-NEXT:    sext.w a7, a4
 ; RV64IA-NEXT:    andi t0, a6, 255
 ; RV64IA-NEXT:    sltu t0, t0, a5
 ; RV64IA-NEXT:    addi t0, t0, -1
@@ -164,20 +164,20 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
 ; RV64IA-NEXT:    subw a6, a6, t0
 ; RV64IA-NEXT:    andi a6, a6, 255
 ; RV64IA-NEXT:    sllw a6, a6, a0
-; RV64IA-NEXT:    and a3, a3, a4
-; RV64IA-NEXT:    or a6, a3, a6
+; RV64IA-NEXT:    and a4, a4, a3
+; RV64IA-NEXT:    or a6, a4, a6
 ; RV64IA-NEXT:  .LBB0_3: # %atomicrmw.start
 ; RV64IA-NEXT:    # Parent Loop BB0_1 Depth=1
 ; RV64IA-NEXT:    # => This Inner Loop Header: Depth=2
-; RV64IA-NEXT:    lr.w.aqrl a3, (a2)
-; RV64IA-NEXT:    bne a3, a7, .LBB0_1
+; RV64IA-NEXT:    lr.w.aqrl a4, (a2)
+; RV64IA-NEXT:    bne a4, a7, .LBB0_1
 ; RV64IA-NEXT:  # %bb.4: # %atomicrmw.start
 ; RV64IA-NEXT:    # in Loop: Header=BB0_3 Depth=2
 ; RV64IA-NEXT:    sc.w.rl t0, a6, (a2)
 ; RV64IA-NEXT:    bnez t0, .LBB0_3
 ; RV64IA-NEXT:  # %bb.5: # %atomicrmw.start
 ; RV64IA-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64IA-NEXT:    srlw a0, a3, a0
+; RV64IA-NEXT:    srlw a0, a4, a0
 ; RV64IA-NEXT:    ret
   %result = atomicrmw usub_cond ptr %ptr, i8 %val seq_cst
   ret i8 %result
@@ -200,27 +200,27 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
 ; RV32I-NEXT:    .cfi_offset s3, -20
 ; RV32I-NEXT:    mv s0, a1
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lhu a1, 0(a0)
+; RV32I-NEXT:    lhu a0, 0(a0)
 ; RV32I-NEXT:    lui s2, 16
 ; RV32I-NEXT:    addi s2, s2, -1
-; RV32I-NEXT:    and s3, s0, s2
+; RV32I-NEXT:    and s3, a1, s2
 ; RV32I-NEXT:  .LBB1_1: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    and a0, a1, s2
-; RV32I-NEXT:    sltu a0, a0, s3
-; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, s0
-; RV32I-NEXT:    sub a2, a1, a0
-; RV32I-NEXT:    sh a1, 10(sp)
+; RV32I-NEXT:    and a1, a0, s2
+; RV32I-NEXT:    sltu a1, a1, s3
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sh a0, 10(sp)
 ; RV32I-NEXT:    addi a1, sp, 10
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a1, 10(sp)
-; RV32I-NEXT:    beqz a0, .LBB1_1
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 10(sp)
+; RV32I-NEXT:    beqz a1, .LBB1_1
 ; RV32I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -242,9 +242,9 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
 ; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    addi a3, a3, -1
-; RV32IA-NEXT:    lw a6, 0(a2)
 ; RV32IA-NEXT:    sll a4, a3, a4
 ; RV32IA-NEXT:    not a4, a4
+; RV32IA-NEXT:    lw a6, 0(a2)
 ; RV32IA-NEXT:    and a5, a1, a3
 ; RV32IA-NEXT:  .LBB1_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Loop Header: Depth=1
@@ -290,27 +290,27 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
 ; RV64I-NEXT:    .cfi_offset s3, -40
 ; RV64I-NEXT:    mv s0, a1
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    lhu a0, 0(a0)
 ; RV64I-NEXT:    lui s2, 16
 ; RV64I-NEXT:    addiw s2, s2, -1
-; RV64I-NEXT:    and s3, s0, s2
+; RV64I-NEXT:    and s3, a1, s2
 ; RV64I-NEXT:  .LBB1_1: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    and a0, a1, s2
-; RV64I-NEXT:    sltu a0, a0, s3
-; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, s0
-; RV64I-NEXT:    sub a2, a1, a0
-; RV64I-NEXT:    sh a1, 6(sp)
+; RV64I-NEXT:    and a1, a0, s2
+; RV64I-NEXT:    sltu a1, a1, s3
+; RV64I-NEXT:    addi a1, a1, -1
+; RV64I-NEXT:    and a1, a1, s0
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sh a0, 6(sp)
 ; RV64I-NEXT:    addi a1, sp, 6
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a1, 6(sp)
-; RV64I-NEXT:    beqz a0, .LBB1_1
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 6(sp)
+; RV64I-NEXT:    beqz a1, .LBB1_1
 ; RV64I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -328,19 +328,19 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
 ; RV64IA-LABEL: atomicrmw_usub_cond_i16:
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
-; RV64IA-NEXT:    slli a5, a0, 3
+; RV64IA-NEXT:    slli a4, a0, 3
 ; RV64IA-NEXT:    lui a3, 16
-; RV64IA-NEXT:    andi a0, a5, 24
+; RV64IA-NEXT:    andi a0, a4, 24
 ; RV64IA-NEXT:    addiw a3, a3, -1
-; RV64IA-NEXT:    lw a4, 0(a2)
-; RV64IA-NEXT:    sllw a5, a3, a5
-; RV64IA-NEXT:    not a5, a5
+; RV64IA-NEXT:    sllw a4, a3, a4
+; RV64IA-NEXT:    not a4, a4
+; RV64IA-NEXT:    lw a5, 0(a2)
 ; RV64IA-NEXT:    and a6, a1, a3
 ; RV64IA-NEXT:  .LBB1_1: # %atomicrmw.start
 ; RV64IA-NEXT:    # =>This Loop Header: Depth=1
 ; RV64IA-NEXT:    # Child Loop BB1_3 Depth 2
-; RV64IA-NEXT:    srlw a7, a4, a0
-; RV64IA-NEXT:    sext.w t0, a4
+; RV64IA-NEXT:    srlw a7, a5, a0
+; RV64IA-NEXT:    sext.w t0, a5
 ; RV64IA-NEXT:    and t1, a7, a3
 ; RV64IA-NEXT:    sltu t1, t1, a6
 ; RV64IA-NEXT:    addi t1, t1, -1
@@ -348,20 +348,20 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
 ; RV64IA-NEXT:    subw a7, a7, t1
 ; RV64IA-NEXT:    and a7, a7, a3
 ; RV64IA-NEXT:    sllw a7, a7, a0
-; RV64IA-NEXT:    and a4, a4, a5
-; RV64IA-NEXT:    or a7, a4, a7
+; RV64IA-NEXT:    and a5, a5, a4
+; RV64IA-NEXT:    or a7, a5, a7
 ; RV64IA-NEXT:  .LBB1_3: # %atomicrmw.start
 ; RV64IA-NEXT:    # Parent Loop BB1_1 Depth=1
 ; RV64IA-NEXT:    # => This Inner Loop Header: Depth=2
-; RV64IA-NEXT:    lr.w.aqrl a4, (a2)
-; RV64IA-NEXT:    bne a4, t0, .LBB1_1
+; RV64IA-NEXT:    lr.w.aqrl a5, (a2)
+; RV64IA-NEXT:    bne a5, t0, .LBB1_1
 ; RV64IA-NEXT:  # %bb.4: # %atomicrmw.start
 ; RV64IA-NEXT:    # in Loop: Header=BB1_3 Depth=2
 ; RV64IA-NEXT:    sc.w.rl t1, a7, (a2)
 ; RV64IA-NEXT:    bnez t1, .LBB1_3
 ; RV64IA-NEXT:  # %bb.5: # %atomicrmw.start
 ; RV64IA-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64IA-NEXT:    srlw a0, a4, a0
+; RV64IA-NEXT:    srlw a0, a5, a0
 ; RV64IA-NEXT:    ret
   %result = atomicrmw usub_cond ptr %ptr, i16 %val seq_cst
   ret i16 %result
@@ -378,25 +378,25 @@ define i32 @atomicrmw_usub_cond_i32(ptr %ptr, i32 %val) {
 ; RV32I-NEXT:    .cfi_offset ra, -4
 ; RV32I-NEXT:    .cfi_offset s0, -8
 ; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:  .LBB2_1: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    sltu a0, a3, s1
-; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, s1
-; RV32I-NEXT:    sub a2, a3, a0
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sltu a1, a0, s0
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a1, a1, s0
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    beqz a0, .LBB2_1
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    beqz a1, .LBB2_1
 ; RV32I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -444,26 +444,26 @@ define i32 @atomicrmw_usub_cond_i32(ptr %ptr, i32 %val) {
 ; RV64I-NEXT:    .cfi_offset s0, -16
 ; RV64I-NEXT:    .cfi_offset s1, -24
 ; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:  .LBB2_1: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sltu a0, a3, s2
-; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, s1
-; RV64I-NEXT:    subw a2, a3, a0
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sltu a1, a0, s2
+; RV64I-NEXT:    addi a1, a1, -1
+; RV64I-NEXT:    and a1, a1, s0
+; RV64I-NEXT:    subw a2, a0, a1
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    beqz a0, .LBB2_1
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    beqz a1, .LBB2_1
 ; RV64I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -519,43 +519,42 @@ define i64 @atomicrmw_usub_cond_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB3_3
 ; RV32I-NEXT:  .LBB3_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32I-NEXT:    sltu a0, a5, s0
+; RV32I-NEXT:    sltu a2, a1, s0
 ; RV32I-NEXT:  .LBB3_2: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32I-NEXT:    xori a0, a0, 1
-; RV32I-NEXT:    neg a0, a0
-; RV32I-NEXT:    and a1, a0, s2
-; RV32I-NEXT:    and a0, a0, s0
-; RV32I-NEXT:    sltu a3, a4, a1
-; RV32I-NEXT:    sub a0, a5, a0
-; RV32I-NEXT:    sub a2, a4, a1
-; RV32I-NEXT:    sub a3, a0, a3
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    xori a2, a2, 1
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    and a3, a2, s1
+; RV32I-NEXT:    and a2, a2, s0
+; RV32I-NEXT:    sltu a4, a0, a3
+; RV32I-NEXT:    sub a5, a1, a2
+; RV32I-NEXT:    sub a2, a0, a3
+; RV32I-NEXT:    sub a3, a5, a4
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB3_5
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB3_5
 ; RV32I-NEXT:  .LBB3_3: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    bne a5, s0, .LBB3_1
+; RV32I-NEXT:    bne a1, s0, .LBB3_1
 ; RV32I-NEXT:  # %bb.4: # in Loop: Header=BB3_3 Depth=1
-; RV32I-NEXT:    sltu a0, a4, s2
+; RV32I-NEXT:    sltu a2, a0, s1
 ; RV32I-NEXT:    j .LBB3_2
 ; RV32I-NEXT:  .LBB3_5: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -581,43 +580,42 @@ define i64 @atomicrmw_usub_cond_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:    .cfi_offset s1, -12
 ; RV32IA-NEXT:    .cfi_offset s2, -16
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB3_3
 ; RV32IA-NEXT:  .LBB3_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32IA-NEXT:    sltu a0, a5, s0
+; RV32IA-NEXT:    sltu a2, a1, s0
 ; RV32IA-NEXT:  .LBB3_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32IA-NEXT:    xori a0, a0, 1
-; RV32IA-NEXT:    neg a0, a0
-; RV32IA-NEXT:    and a1, a0, s2
-; RV32IA-NEXT:    and a0, a0, s0
-; RV32IA-NEXT:    sltu a3, a4, a1
-; RV32IA-NEXT:    sub a0, a5, a0
-; RV32IA-NEXT:    sub a2, a4, a1
-; RV32IA-NEXT:    sub a3, a0, a3
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    xori a2, a2, 1
+; RV32IA-NEXT:    neg a2, a2
+; RV32IA-NEXT:    and a3, a2, s1
+; RV32IA-NEXT:    and a2, a2, s0
+; RV32IA-NEXT:    sltu a4, a0, a3
+; RV32IA-NEXT:    sub a5, a1, a2
+; RV32IA-NEXT:    sub a2, a0, a3
+; RV32IA-NEXT:    sub a3, a5, a4
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB3_5
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB3_5
 ; RV32IA-NEXT:  .LBB3_3: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    bne a5, s0, .LBB3_1
+; RV32IA-NEXT:    bne a1, s0, .LBB3_1
 ; RV32IA-NEXT:  # %bb.4: # in Loop: Header=BB3_3 Depth=1
-; RV32IA-NEXT:    sltu a0, a4, s2
+; RV32IA-NEXT:    sltu a2, a0, s1
 ; RV32IA-NEXT:    j .LBB3_2
 ; RV32IA-NEXT:  .LBB3_5: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -640,25 +638,25 @@ define i64 @atomicrmw_usub_cond_i64(ptr %ptr, i64 %val) {
 ; RV64I-NEXT:    .cfi_offset ra, -8
 ; RV64I-NEXT:    .cfi_offset s0, -16
 ; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:  .LBB3_1: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sltu a0, a3, s1
-; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, s1
-; RV64I-NEXT:    sub a2, a3, a0
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sltu a1, a0, s0
+; RV64I-NEXT:    addi a1, a1, -1
+; RV64I-NEXT:    and a1, a1, s0
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    beqz a0, .LBB3_1
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    beqz a1, .LBB3_1
 ; RV64I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -709,25 +707,25 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; RV32I-NEXT:    .cfi_offset s0, -8
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    andi s1, a1, 255
 ; RV32I-NEXT:  .LBB4_1: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    andi a0, a3, 255
-; RV32I-NEXT:    sub a1, a0, s1
-; RV32I-NEXT:    sltu a0, a0, a1
-; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a2, a0, a1
-; RV32I-NEXT:    sb a3, 3(sp)
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    sub a2, a1, s1
+; RV32I-NEXT:    sltu a1, a1, a2
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a2, a1, a2
+; RV32I-NEXT:    sb a0, 3(sp)
 ; RV32I-NEXT:    addi a1, sp, 3
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 3(sp)
-; RV32I-NEXT:    beqz a0, .LBB4_1
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 3(sp)
+; RV32I-NEXT:    beqz a1, .LBB4_1
 ; RV32I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -741,12 +739,12 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; RV32IA-LABEL: atomicrmw_usub_sat_i8:
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
-; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
-; RV32IA-NEXT:    lw a4, 0(a2)
-; RV32IA-NEXT:    andi a0, a0, 24
+; RV32IA-NEXT:    slli a3, a0, 3
+; RV32IA-NEXT:    li a4, 255
+; RV32IA-NEXT:    andi a0, a3, 24
+; RV32IA-NEXT:    sll a3, a4, a3
 ; RV32IA-NEXT:    not a3, a3
+; RV32IA-NEXT:    lw a4, 0(a2)
 ; RV32IA-NEXT:    andi a1, a1, 255
 ; RV32IA-NEXT:  .LBB4_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Loop Header: Depth=1
@@ -786,25 +784,25 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; RV64I-NEXT:    .cfi_offset s0, -16
 ; RV64I-NEXT:    .cfi_offset s1, -24
 ; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    andi s1, a1, 255
 ; RV64I-NEXT:  .LBB4_1: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    andi a0, a3, 255
-; RV64I-NEXT:    sub a1, a0, s1
-; RV64I-NEXT:    sltu a0, a0, a1
-; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    sb a3, 7(sp)
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    sub a2, a1, s1
+; RV64I-NEXT:    sltu a1, a1, a2
+; RV64I-NEXT:    addi a1, a1, -1
+; RV64I-NEXT:    and a2, a1, a2
+; RV64I-NEXT:    sb a0, 7(sp)
 ; RV64I-NEXT:    addi a1, sp, 7
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 7(sp)
-; RV64I-NEXT:    beqz a0, .LBB4_1
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 7(sp)
+; RV64I-NEXT:    beqz a1, .LBB4_1
 ; RV64I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -818,38 +816,38 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; RV64IA-LABEL: atomicrmw_usub_sat_i8:
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
-; RV64IA-NEXT:    slli a0, a0, 3
-; RV64IA-NEXT:    li a3, 255
-; RV64IA-NEXT:    sllw a4, a3, a0
-; RV64IA-NEXT:    lw a3, 0(a2)
-; RV64IA-NEXT:    andi a0, a0, 24
-; RV64IA-NEXT:    not a4, a4
+; RV64IA-NEXT:    slli a3, a0, 3
+; RV64IA-NEXT:    li a4, 255
+; RV64IA-NEXT:    andi a0, a3, 24
+; RV64IA-NEXT:    sllw a3, a4, a3
+; RV64IA-NEXT:    not a3, a3
+; RV64IA-NEXT:    lw a4, 0(a2)
 ; RV64IA-NEXT:    andi a1, a1, 255
 ; RV64IA-NEXT:  .LBB4_1: # %atomicrmw.start
 ; RV64IA-NEXT:    # =>This Loop Header: Depth=1
 ; RV64IA-NEXT:    # Child Loop BB4_3 Depth 2
-; RV64IA-NEXT:    srlw a5, a3, a0
-; RV64IA-NEXT:    sext.w a6, a3
+; RV64IA-NEXT:    srlw a5, a4, a0
+; RV64IA-NEXT:    sext.w a6, a4
 ; RV64IA-NEXT:    andi a5, a5, 255
 ; RV64IA-NEXT:    sub a7, a5, a1
 ; RV64IA-NEXT:    sltu a5, a5, a7
 ; RV64IA-NEXT:    addi a5, a5, -1
 ; RV64IA-NEXT:    and a5, a5, a7
 ; RV64IA-NEXT:    sllw a5, a5, a0
-; RV64IA-NEXT:    and a3, a3, a4
-; RV64IA-NEXT:    or a5, a3, a5
+; RV64IA-NEXT:    and a4, a4, a3
+; RV64IA-NEXT:    or a5, a4, a5
 ; RV64IA-NEXT:  .LBB4_3: # %atomicrmw.start
 ; RV64IA-NEXT:    # Parent Loop BB4_1 Depth=1
 ; RV64IA-NEXT:    # => This Inner Loop Header: Depth=2
-; RV64IA-NEXT:    lr.w.aqrl a3, (a2)
-; RV64IA-NEXT:    bne a3, a6, .LBB4_1
+; RV64IA-NEXT:    lr.w.aqrl a4, (a2)
+; RV64IA-NEXT:    bne a4, a6, .LBB4_1
 ; RV64IA-NEXT:  # %bb.4: # %atomicrmw.start
 ; RV64IA-NEXT:    # in Loop: Header=BB4_3 Depth=2
 ; RV64IA-NEXT:    sc.w.rl a7, a5, (a2)
 ; RV64IA-NEXT:    bnez a7, .LBB4_3
 ; RV64IA-NEXT:  # %bb.5: # %atomicrmw.start
 ; RV64IA-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64IA-NEXT:    srlw a0, a3, a0
+; RV64IA-NEXT:    srlw a0, a4, a0
 ; RV64IA-NEXT:    ret
   %result = atomicrmw usub_sat ptr %ptr, i8 %val seq_cst
   ret i8 %result
@@ -869,27 +867,27 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lhu a3, 0(a0)
+; RV32I-NEXT:    lhu a0, 0(a0)
 ; RV32I-NEXT:    lui s1, 16
 ; RV32I-NEXT:    addi s1, s1, -1
 ; RV32I-NEXT:    and s2, a1, s1
 ; RV32I-NEXT:  .LBB5_1: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    and a0, a3, s1
-; RV32I-NEXT:    sub a1, a0, s2
-; RV32I-NEXT:    sltu a0, a0, a1
-; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a2, a0, a1
-; RV32I-NEXT:    sh a3, 14(sp)
+; RV32I-NEXT:    and a1, a0, s1
+; RV32I-NEXT:    sub a2, a1, s2
+; RV32I-NEXT:    sltu a1, a1, a2
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a2, a1, a2
+; RV32I-NEXT:    sh a0, 14(sp)
 ; RV32I-NEXT:    addi a1, sp, 14
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a3, 14(sp)
-; RV32I-NEXT:    beqz a0, .LBB5_1
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 14(sp)
+; RV32I-NEXT:    beqz a1, .LBB5_1
 ; RV32I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -909,9 +907,9 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    addi a3, a3, -1
-; RV32IA-NEXT:    lw a5, 0(a2)
 ; RV32IA-NEXT:    sll a4, a3, a4
 ; RV32IA-NEXT:    not a4, a4
+; RV32IA-NEXT:    lw a5, 0(a2)
 ; RV32IA-NEXT:    and a1, a1, a3
 ; RV32IA-NEXT:  .LBB5_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Loop Header: Depth=1
@@ -953,27 +951,27 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; RV64I-NEXT:    .cfi_offset s1, -24
 ; RV64I-NEXT:    .cfi_offset s2, -32
 ; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lhu a3, 0(a0)
+; RV64I-NEXT:    lhu a0, 0(a0)
 ; RV64I-NEXT:    lui s1, 16
 ; RV64I-NEXT:    addiw s1, s1, -1
 ; RV64I-NEXT:    and s2, a1, s1
 ; RV64I-NEXT:  .LBB5_1: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    and a0, a3, s1
-; RV64I-NEXT:    sub a1, a0, s2
-; RV64I-NEXT:    sltu a0, a0, a1
-; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a2, a0, a1
-; RV64I-NEXT:    sh a3, 14(sp)
+; RV64I-NEXT:    and a1, a0, s1
+; RV64I-NEXT:    sub a2, a1, s2
+; RV64I-NEXT:    sltu a1, a1, a2
+; RV64I-NEXT:    addi a1, a1, -1
+; RV64I-NEXT:    and a2, a1, a2
+; RV64I-NEXT:    sh a0, 14(sp)
 ; RV64I-NEXT:    addi a1, sp, 14
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a3, 14(sp)
-; RV64I-NEXT:    beqz a0, .LBB5_1
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 14(sp)
+; RV64I-NEXT:    beqz a1, .LBB5_1
 ; RV64I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -989,39 +987,39 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; RV64IA-LABEL: atomicrmw_usub_sat_i16:
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
-; RV64IA-NEXT:    slli a5, a0, 3
+; RV64IA-NEXT:    slli a4, a0, 3
 ; RV64IA-NEXT:    lui a3, 16
-; RV64IA-NEXT:    andi a0, a5, 24
+; RV64IA-NEXT:    andi a0, a4, 24
 ; RV64IA-NEXT:    addiw a3, a3, -1
-; RV64IA-NEXT:    lw a4, 0(a2)
-; RV64IA-NEXT:    sllw a5, a3, a5
-; RV64IA-NEXT:    not a5, a5
+; RV64IA-NEXT:    sllw a4, a3, a4
+; RV64IA-NEXT:    not a4, a4
+; RV64IA-NEXT:    lw a5, 0(a2)
 ; RV64IA-NEXT:    and a1, a1, a3
 ; RV64IA-NEXT:  .LBB5_1: # %atomicrmw.start
 ; RV64IA-NEXT:    # =>This Loop Header: Depth=1
 ; RV64IA-NEXT:    # Child Loop BB5_3 Depth 2
-; RV64IA-NEXT:    srlw a6, a4, a0
-; RV64IA-NEXT:    sext.w a7, a4
+; RV64IA-NEXT:    srlw a6, a5, a0
+; RV64IA-NEXT:    sext.w a7, a5
 ; RV64IA-NEXT:    and a6, a6, a3
 ; RV64IA-NEXT:    sub t0, a6, a1
 ; RV64IA-NEXT:    sltu a6, a6, t0
 ; RV64IA-NEXT:    addi a6, a6, -1
 ; RV64IA-NEXT:    and a6, a6, t0
 ; RV64IA-NEXT:    sllw a6, a6, a0
-; RV64IA-NEXT:    and a4, a4, a5
-; RV64IA-NEXT:    or a6, a4, a6
+; RV64IA-NEXT:    and a5, a5, a4
+; RV64IA-NEXT:    or a6, a5, a6
 ; RV64IA-NEXT:  .LBB5_3: # %atomicrmw.start
 ; RV64IA-NEXT:    # Parent Loop BB5_1 Depth=1
 ; RV64IA-NEXT:    # => This Inner Loop Header: Depth=2
-; RV64IA-NEXT:    lr.w.aqrl a4, (a2)
-; RV64IA-NEXT:    bne a4, a7, .LBB5_1
+; RV64IA-NEXT:    lr.w.aqrl a5, (a2)
+; RV64IA-NEXT:    bne a5, a7, .LBB5_1
 ; RV64IA-NEXT:  # %bb.4: # %atomicrmw.start
 ; RV64IA-NEXT:    # in Loop: Header=BB5_3 Depth=2
 ; RV64IA-NEXT:    sc.w.rl t0, a6, (a2)
 ; RV64IA-NEXT:    bnez t0, .LBB5_3
 ; RV64IA-NEXT:  # %bb.5: # %atomicrmw.start
 ; RV64IA-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64IA-NEXT:    srlw a0, a4, a0
+; RV64IA-NEXT:    srlw a0, a5, a0
 ; RV64IA-NEXT:    ret
   %result = atomicrmw usub_sat ptr %ptr, i16 %val seq_cst
   ret i16 %result
@@ -1038,25 +1036,25 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
 ; RV32I-NEXT:    .cfi_offset ra, -4
 ; RV32I-NEXT:    .cfi_offset s0, -8
 ; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:  .LBB6_1: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    sub a0, a3, s1
-; RV32I-NEXT:    sltu a1, a3, a0
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    and a2, a1, a0
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sub a1, a0, s0
+; RV32I-NEXT:    sltu a2, a0, a1
+; RV32I-NEXT:    addi a2, a2, -1
+; RV32I-NEXT:    and a2, a2, a1
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    beqz a0, .LBB6_1
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    beqz a1, .LBB6_1
 ; RV32I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1102,25 +1100,25 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
 ; RV64I-NEXT:    .cfi_offset ra, -8
 ; RV64I-NEXT:    .cfi_offset s0, -16
 ; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:  .LBB6_1: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    subw a0, a3, s1
-; RV64I-NEXT:    sltu a1, a3, a0
-; RV64I-NEXT:    addi a1, a1, -1
-; RV64I-NEXT:    and a2, a1, a0
-; RV64I-NEXT:    sw a3, 4(sp)
+; RV64I-NEXT:    subw a1, a0, s0
+; RV64I-NEXT:    sltu a2, a0, a1
+; RV64I-NEXT:    addi a2, a2, -1
+; RV64I-NEXT:    and a2, a2, a1
+; RV64I-NEXT:    sw a0, 4(sp)
 ; RV64I-NEXT:    addi a1, sp, 4
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 4(sp)
-; RV64I-NEXT:    beqz a0, .LBB6_1
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 4(sp)
+; RV64I-NEXT:    beqz a1, .LBB6_1
 ; RV64I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -1173,42 +1171,41 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB7_3
 ; RV32I-NEXT:  .LBB7_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB7_3 Depth=1
-; RV32I-NEXT:    sltu a2, a5, a0
+; RV32I-NEXT:    sltu a4, a1, a3
 ; RV32I-NEXT:  .LBB7_2: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB7_3 Depth=1
-; RV32I-NEXT:    addi a3, a2, -1
-; RV32I-NEXT:    and a2, a3, a1
-; RV32I-NEXT:    and a3, a3, a0
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    addi a4, a4, -1
+; RV32I-NEXT:    and a2, a4, a2
+; RV32I-NEXT:    and a3, a4, a3
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB7_5
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB7_5
 ; RV32I-NEXT:  .LBB7_3: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    sltu a0, a4, s2
-; RV32I-NEXT:    sub a1, a5, s0
-; RV32I-NEXT:    sub a0, a1, a0
-; RV32I-NEXT:    sub a1, a4, s2
-; RV32I-NEXT:    bne a0, a5, .LBB7_1
+; RV32I-NEXT:    sltu a2, a0, s1
+; RV32I-NEXT:    sub a3, a1, s0
+; RV32I-NEXT:    sub a3, a3, a2
+; RV32I-NEXT:    sub a2, a0, s1
+; RV32I-NEXT:    bne a3, a1, .LBB7_1
 ; RV32I-NEXT:  # %bb.4: # in Loop: Header=BB7_3 Depth=1
-; RV32I-NEXT:    sltu a2, a4, a1
+; RV32I-NEXT:    sltu a4, a0, a2
 ; RV32I-NEXT:    j .LBB7_2
 ; RV32I-NEXT:  .LBB7_5: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -1234,42 +1231,41 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:    .cfi_offset s1, -12
 ; RV32IA-NEXT:    .cfi_offset s2, -16
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB7_3
 ; RV32IA-NEXT:  .LBB7_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB7_3 Depth=1
-; RV32IA-NEXT:    sltu a2, a5, a0
+; RV32IA-NEXT:    sltu a4, a1, a3
 ; RV32IA-NEXT:  .LBB7_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB7_3 Depth=1
-; RV32IA-NEXT:    addi a3, a2, -1
-; RV32IA-NEXT:    and a2, a3, a1
-; RV32IA-NEXT:    and a3, a3, a0
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    addi a4, a4, -1
+; RV32IA-NEXT:    and a2, a4, a2
+; RV32IA-NEXT:    and a3, a4, a3
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB7_5
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB7_5
 ; RV32IA-NEXT:  .LBB7_3: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    sltu a0, a4, s2
-; RV32IA-NEXT:    sub a1, a5, s0
-; RV32IA-NEXT:    sub a0, a1, a0
-; RV32IA-NEXT:    sub a1, a4, s2
-; RV32IA-NEXT:    bne a0, a5, .LBB7_1
+; RV32IA-NEXT:    sltu a2, a0, s1
+; RV32IA-NEXT:    sub a3, a1, s0
+; RV32IA-NEXT:    sub a3, a3, a2
+; RV32IA-NEXT:    sub a2, a0, s1
+; RV32IA-NEXT:    bne a3, a1, .LBB7_1
 ; RV32IA-NEXT:  # %bb.4: # in Loop: Header=BB7_3 Depth=1
-; RV32IA-NEXT:    sltu a2, a4, a1
+; RV32IA-NEXT:    sltu a4, a0, a2
 ; RV32IA-NEXT:    j .LBB7_2
 ; RV32IA-NEXT:  .LBB7_5: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -1292,25 +1288,25 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; RV64I-NEXT:    .cfi_offset ra, -8
 ; RV64I-NEXT:    .cfi_offset s0, -16
 ; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:  .LBB7_1: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    sub a0, a3, s1
-; RV64I-NEXT:    sltu a1, a3, a0
-; RV64I-NEXT:    addi a1, a1, -1
-; RV64I-NEXT:    and a2, a1, a0
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sub a1, a0, s0
+; RV64I-NEXT:    sltu a2, a0, a1
+; RV64I-NEXT:    addi a2, a2, -1
+; RV64I-NEXT:    and a2, a2, a1
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    beqz a0, .LBB7_1
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    beqz a1, .LBB7_1
 ; RV64I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
index 3ff01e4987bd5..d67e047e8b05b 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
@@ -25,25 +25,25 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; RV32I-NEXT:    .cfi_offset s0, -8
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    andi s1, a1, 255
 ; RV32I-NEXT:  .LBB0_1: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    addi a0, a3, 1
-; RV32I-NEXT:    andi a1, a3, 255
-; RV32I-NEXT:    sltu a1, a1, s1
-; RV32I-NEXT:    neg a2, a1
-; RV32I-NEXT:    and a2, a2, a0
-; RV32I-NEXT:    sb a3, 3(sp)
+; RV32I-NEXT:    addi a1, a0, 1
+; RV32I-NEXT:    andi a2, a0, 255
+; RV32I-NEXT:    sltu a2, a2, s1
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    and a2, a2, a1
+; RV32I-NEXT:    sb a0, 3(sp)
 ; RV32I-NEXT:    addi a1, sp, 3
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 3(sp)
-; RV32I-NEXT:    beqz a0, .LBB0_1
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 3(sp)
+; RV32I-NEXT:    beqz a1, .LBB0_1
 ; RV32I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -57,12 +57,12 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; RV32IA-LABEL: atomicrmw_uinc_wrap_i8:
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
-; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
-; RV32IA-NEXT:    lw a4, 0(a2)
-; RV32IA-NEXT:    andi a0, a0, 24
+; RV32IA-NEXT:    slli a3, a0, 3
+; RV32IA-NEXT:    li a4, 255
+; RV32IA-NEXT:    andi a0, a3, 24
+; RV32IA-NEXT:    sll a3, a4, a3
 ; RV32IA-NEXT:    not a3, a3
+; RV32IA-NEXT:    lw a4, 0(a2)
 ; RV32IA-NEXT:    andi a1, a1, 255
 ; RV32IA-NEXT:  .LBB0_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Loop Header: Depth=1
@@ -103,25 +103,25 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; RV64I-NEXT:    .cfi_offset s0, -16
 ; RV64I-NEXT:    .cfi_offset s1, -24
 ; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    andi s1, a1, 255
 ; RV64I-NEXT:  .LBB0_1: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    addi a0, a3, 1
-; RV64I-NEXT:    andi a1, a3, 255
-; RV64I-NEXT:    sltu a1, a1, s1
-; RV64I-NEXT:    neg a2, a1
-; RV64I-NEXT:    and a2, a2, a0
-; RV64I-NEXT:    sb a3, 7(sp)
+; RV64I-NEXT:    addi a1, a0, 1
+; RV64I-NEXT:    andi a2, a0, 255
+; RV64I-NEXT:    sltu a2, a2, s1
+; RV64I-NEXT:    neg a2, a2
+; RV64I-NEXT:    and a2, a2, a1
+; RV64I-NEXT:    sb a0, 7(sp)
 ; RV64I-NEXT:    addi a1, sp, 7
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 7(sp)
-; RV64I-NEXT:    beqz a0, .LBB0_1
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 7(sp)
+; RV64I-NEXT:    beqz a1, .LBB0_1
 ; RV64I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -135,18 +135,18 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; RV64IA-LABEL: atomicrmw_uinc_wrap_i8:
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
-; RV64IA-NEXT:    slli a0, a0, 3
-; RV64IA-NEXT:    li a3, 255
-; RV64IA-NEXT:    sllw a4, a3, a0
-; RV64IA-NEXT:    lw a3, 0(a2)
-; RV64IA-NEXT:    andi a0, a0, 24
-; RV64IA-NEXT:    not a4, a4
+; RV64IA-NEXT:    slli a3, a0, 3
+; RV64IA-NEXT:    li a4, 255
+; RV64IA-NEXT:    andi a0, a3, 24
+; RV64IA-NEXT:    sllw a3, a4, a3
+; RV64IA-NEXT:    not a3, a3
+; RV64IA-NEXT:    lw a4, 0(a2)
 ; RV64IA-NEXT:    andi a1, a1, 255
 ; RV64IA-NEXT:  .LBB0_1: # %atomicrmw.start
 ; RV64IA-NEXT:    # =>This Loop Header: Depth=1
 ; RV64IA-NEXT:    # Child Loop BB0_3 Depth 2
-; RV64IA-NEXT:    srlw a5, a3, a0
-; RV64IA-NEXT:    sext.w a6, a3
+; RV64IA-NEXT:    srlw a5, a4, a0
+; RV64IA-NEXT:    sext.w a6, a4
 ; RV64IA-NEXT:    andi a7, a5, 255
 ; RV64IA-NEXT:    addi a5, a5, 1
 ; RV64IA-NEXT:    sltu a7, a7, a1
@@ -154,20 +154,20 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; RV64IA-NEXT:    and a5, a7, a5
 ; RV64IA-NEXT:    andi a5, a5, 255
 ; RV64IA-NEXT:    sllw a5, a5, a0
-; RV64IA-NEXT:    and a3, a3, a4
-; RV64IA-NEXT:    or a5, a3, a5
+; RV64IA-NEXT:    and a4, a4, a3
+; RV64IA-NEXT:    or a5, a4, a5
 ; RV64IA-NEXT:  .LBB0_3: # %atomicrmw.start
 ; RV64IA-NEXT:    # Parent Loop BB0_1 Depth=1
 ; RV64IA-NEXT:    # => This Inner Loop Header: Depth=2
-; RV64IA-NEXT:    lr.w.aqrl a3, (a2)
-; RV64IA-NEXT:    bne a3, a6, .LBB0_1
+; RV64IA-NEXT:    lr.w.aqrl a4, (a2)
+; RV64IA-NEXT:    bne a4, a6, .LBB0_1
 ; RV64IA-NEXT:  # %bb.4: # %atomicrmw.start
 ; RV64IA-NEXT:    # in Loop: Header=BB0_3 Depth=2
 ; RV64IA-NEXT:    sc.w.rl a7, a5, (a2)
 ; RV64IA-NEXT:    bnez a7, .LBB0_3
 ; RV64IA-NEXT:  # %bb.5: # %atomicrmw.start
 ; RV64IA-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64IA-NEXT:    srlw a0, a3, a0
+; RV64IA-NEXT:    srlw a0, a4, a0
 ; RV64IA-NEXT:    ret
   %result = atomicrmw uinc_wrap ptr %ptr, i8 %val seq_cst
   ret i8 %result
@@ -187,27 +187,27 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lhu a3, 0(a0)
+; RV32I-NEXT:    lhu a0, 0(a0)
 ; RV32I-NEXT:    lui s1, 16
 ; RV32I-NEXT:    addi s1, s1, -1
 ; RV32I-NEXT:    and s2, a1, s1
 ; RV32I-NEXT:  .LBB1_1: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    and a0, a3, s1
-; RV32I-NEXT:    addi a1, a3, 1
-; RV32I-NEXT:    sltu a0, a0, s2
-; RV32I-NEXT:    neg a2, a0
-; RV32I-NEXT:    and a2, a2, a1
-; RV32I-NEXT:    sh a3, 14(sp)
+; RV32I-NEXT:    and a1, a0, s1
+; RV32I-NEXT:    addi a2, a0, 1
+; RV32I-NEXT:    sltu a1, a1, s2
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    and a2, a1, a2
+; RV32I-NEXT:    sh a0, 14(sp)
 ; RV32I-NEXT:    addi a1, sp, 14
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a3, 14(sp)
-; RV32I-NEXT:    beqz a0, .LBB1_1
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 14(sp)
+; RV32I-NEXT:    beqz a1, .LBB1_1
 ; RV32I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -227,9 +227,9 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    addi a3, a3, -1
-; RV32IA-NEXT:    lw a5, 0(a2)
 ; RV32IA-NEXT:    sll a4, a3, a4
 ; RV32IA-NEXT:    not a4, a4
+; RV32IA-NEXT:    lw a5, 0(a2)
 ; RV32IA-NEXT:    and a1, a1, a3
 ; RV32IA-NEXT:  .LBB1_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Loop Header: Depth=1
@@ -272,27 +272,27 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; RV64I-NEXT:    .cfi_offset s1, -24
 ; RV64I-NEXT:    .cfi_offset s2, -32
 ; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lhu a3, 0(a0)
+; RV64I-NEXT:    lhu a0, 0(a0)
 ; RV64I-NEXT:    lui s1, 16
 ; RV64I-NEXT:    addiw s1, s1, -1
 ; RV64I-NEXT:    and s2, a1, s1
 ; RV64I-NEXT:  .LBB1_1: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    and a0, a3, s1
-; RV64I-NEXT:    addi a1, a3, 1
-; RV64I-NEXT:    sltu a0, a0, s2
-; RV64I-NEXT:    neg a2, a0
-; RV64I-NEXT:    and a2, a2, a1
-; RV64I-NEXT:    sh a3, 14(sp)
+; RV64I-NEXT:    and a1, a0, s1
+; RV64I-NEXT:    addi a2, a0, 1
+; RV64I-NEXT:    sltu a1, a1, s2
+; RV64I-NEXT:    neg a1, a1
+; RV64I-NEXT:    and a2, a1, a2
+; RV64I-NEXT:    sh a0, 14(sp)
 ; RV64I-NEXT:    addi a1, sp, 14
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a3, 14(sp)
-; RV64I-NEXT:    beqz a0, .LBB1_1
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 14(sp)
+; RV64I-NEXT:    beqz a1, .LBB1_1
 ; RV64I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -308,19 +308,19 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; RV64IA-LABEL: atomicrmw_uinc_wrap_i16:
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
-; RV64IA-NEXT:    slli a5, a0, 3
+; RV64IA-NEXT:    slli a4, a0, 3
 ; RV64IA-NEXT:    lui a3, 16
-; RV64IA-NEXT:    andi a0, a5, 24
+; RV64IA-NEXT:    andi a0, a4, 24
 ; RV64IA-NEXT:    addiw a3, a3, -1
-; RV64IA-NEXT:    lw a4, 0(a2)
-; RV64IA-NEXT:    sllw a5, a3, a5
-; RV64IA-NEXT:    not a5, a5
+; RV64IA-NEXT:    sllw a4, a3, a4
+; RV64IA-NEXT:    not a4, a4
+; RV64IA-NEXT:    lw a5, 0(a2)
 ; RV64IA-NEXT:    and a1, a1, a3
 ; RV64IA-NEXT:  .LBB1_1: # %atomicrmw.start
 ; RV64IA-NEXT:    # =>This Loop Header: Depth=1
 ; RV64IA-NEXT:    # Child Loop BB1_3 Depth 2
-; RV64IA-NEXT:    srlw a6, a4, a0
-; RV64IA-NEXT:    sext.w a7, a4
+; RV64IA-NEXT:    srlw a6, a5, a0
+; RV64IA-NEXT:    sext.w a7, a5
 ; RV64IA-NEXT:    and t0, a6, a3
 ; RV64IA-NEXT:    addi a6, a6, 1
 ; RV64IA-NEXT:    sltu t0, t0, a1
@@ -328,20 +328,20 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; RV64IA-NEXT:    negw t0, t0
 ; RV64IA-NEXT:    and a6, t0, a6
 ; RV64IA-NEXT:    sllw a6, a6, a0
-; RV64IA-NEXT:    and a4, a4, a5
-; RV64IA-NEXT:    or a6, a4, a6
+; RV64IA-NEXT:    and a5, a5, a4
+; RV64IA-NEXT:    or a6, a5, a6
 ; RV64IA-NEXT:  .LBB1_3: # %atomicrmw.start
 ; RV64IA-NEXT:    # Parent Loop BB1_1 Depth=1
 ; RV64IA-NEXT:    # => This Inner Loop Header: Depth=2
-; RV64IA-NEXT:    lr.w.aqrl a4, (a2)
-; RV64IA-NEXT:    bne a4, a7, .LBB1_1
+; RV64IA-NEXT:    lr.w.aqrl a5, (a2)
+; RV64IA-NEXT:    bne a5, a7, .LBB1_1
 ; RV64IA-NEXT:  # %bb.4: # %atomicrmw.start
 ; RV64IA-NEXT:    # in Loop: Header=BB1_3 Depth=2
 ; RV64IA-NEXT:    sc.w.rl t0, a6, (a2)
 ; RV64IA-NEXT:    bnez t0, .LBB1_3
 ; RV64IA-NEXT:  # %bb.5: # %atomicrmw.start
 ; RV64IA-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64IA-NEXT:    srlw a0, a4, a0
+; RV64IA-NEXT:    srlw a0, a5, a0
 ; RV64IA-NEXT:    ret
   %result = atomicrmw uinc_wrap ptr %ptr, i16 %val seq_cst
   ret i16 %result
@@ -358,25 +358,25 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
 ; RV32I-NEXT:    .cfi_offset ra, -4
 ; RV32I-NEXT:    .cfi_offset s0, -8
 ; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:  .LBB2_1: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    addi a0, a3, 1
-; RV32I-NEXT:    sltu a1, a3, s1
-; RV32I-NEXT:    neg a2, a1
-; RV32I-NEXT:    and a2, a2, a0
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    addi a1, a0, 1
+; RV32I-NEXT:    sltu a2, a0, s0
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    and a2, a2, a1
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    beqz a0, .LBB2_1
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    beqz a1, .LBB2_1
 ; RV32I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -423,24 +423,24 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
 ; RV64I-NEXT:    .cfi_offset s0, -16
 ; RV64I-NEXT:    .cfi_offset s1, -24
 ; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s1, a1
 ; RV64I-NEXT:  .LBB2_1: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    addiw a0, a3, 1
-; RV64I-NEXT:    sltu a1, a3, s1
-; RV64I-NEXT:    neg a2, a1
-; RV64I-NEXT:    and a2, a2, a0
-; RV64I-NEXT:    sw a3, 4(sp)
+; RV64I-NEXT:    addiw a1, a0, 1
+; RV64I-NEXT:    sltu a2, a0, s1
+; RV64I-NEXT:    neg a2, a2
+; RV64I-NEXT:    and a2, a2, a1
+; RV64I-NEXT:    sw a0, 4(sp)
 ; RV64I-NEXT:    addi a1, sp, 4
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 4(sp)
-; RV64I-NEXT:    beqz a0, .LBB2_1
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 4(sp)
+; RV64I-NEXT:    beqz a1, .LBB2_1
 ; RV64I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -494,41 +494,40 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB3_3
 ; RV32I-NEXT:  .LBB3_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32I-NEXT:    sltu a0, a5, s0
+; RV32I-NEXT:    sltu a2, a1, s0
 ; RV32I-NEXT:  .LBB3_2: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32I-NEXT:    addi a1, a4, 1
-; RV32I-NEXT:    neg a0, a0
-; RV32I-NEXT:    seqz a3, a1
-; RV32I-NEXT:    and a2, a0, a1
-; RV32I-NEXT:    add a3, a5, a3
-; RV32I-NEXT:    and a3, a0, a3
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    addi a3, a0, 1
+; RV32I-NEXT:    neg a4, a2
+; RV32I-NEXT:    seqz a5, a3
+; RV32I-NEXT:    and a2, a4, a3
+; RV32I-NEXT:    add a3, a1, a5
+; RV32I-NEXT:    and a3, a4, a3
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a4, 8(sp)
-; RV32I-NEXT:    lw a5, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB3_5
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB3_5
 ; RV32I-NEXT:  .LBB3_3: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    bne a5, s0, .LBB3_1
+; RV32I-NEXT:    bne a1, s0, .LBB3_1
 ; RV32I-NEXT:  # %bb.4: # in Loop: Header=BB3_3 Depth=1
-; RV32I-NEXT:    sltu a0, a4, s2
+; RV32I-NEXT:    sltu a2, a0, s1
 ; RV32I-NEXT:    j .LBB3_2
 ; RV32I-NEXT:  .LBB3_5: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -554,41 +553,40 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:    .cfi_offset s1, -12
 ; RV32IA-NEXT:    .cfi_offset s2, -16
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a4, 0(a0)
-; RV32IA-NEXT:    lw a5, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB3_3
 ; RV32IA-NEXT:  .LBB3_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32IA-NEXT:    sltu a0, a5, s0
+; RV32IA-NEXT:    sltu a2, a1, s0
 ; RV32IA-NEXT:  .LBB3_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32IA-NEXT:    addi a1, a4, 1
-; RV32IA-NEXT:    neg a0, a0
-; RV32IA-NEXT:    seqz a3, a1
-; RV32IA-NEXT:    and a2, a0, a1
-; RV32IA-NEXT:    add a3, a5, a3
-; RV32IA-NEXT:    and a3, a0, a3
-; RV32IA-NEXT:    sw a4, 8(sp)
-; RV32IA-NEXT:    sw a5, 12(sp)
+; RV32IA-NEXT:    addi a3, a0, 1
+; RV32IA-NEXT:    neg a4, a2
+; RV32IA-NEXT:    seqz a5, a3
+; RV32IA-NEXT:    and a2, a4, a3
+; RV32IA-NEXT:    add a3, a1, a5
+; RV32IA-NEXT:    and a3, a4, a3
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a4, 8(sp)
-; RV32IA-NEXT:    lw a5, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB3_5
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB3_5
 ; RV32IA-NEXT:  .LBB3_3: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    bne a5, s0, .LBB3_1
+; RV32IA-NEXT:    bne a1, s0, .LBB3_1
 ; RV32IA-NEXT:  # %bb.4: # in Loop: Header=BB3_3 Depth=1
-; RV32IA-NEXT:    sltu a0, a4, s2
+; RV32IA-NEXT:    sltu a2, a0, s1
 ; RV32IA-NEXT:    j .LBB3_2
 ; RV32IA-NEXT:  .LBB3_5: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a4
-; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -611,25 +609,25 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; RV64I-NEXT:    .cfi_offset ra, -8
 ; RV64I-NEXT:    .cfi_offset s0, -16
 ; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:  .LBB3_1: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    addi a0, a3, 1
-; RV64I-NEXT:    sltu a1, a3, s1
-; RV64I-NEXT:    neg a2, a1
-; RV64I-NEXT:    and a2, a2, a0
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    addi a1, a0, 1
+; RV64I-NEXT:    sltu a2, a0, s0
+; RV64I-NEXT:    neg a2, a2
+; RV64I-NEXT:    and a2, a2, a1
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    beqz a0, .LBB3_1
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    beqz a1, .LBB3_1
 ; RV64I-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -681,35 +679,35 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; RV32I-NEXT:    .cfi_offset s0, -8
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    andi s2, a1, 255
 ; RV32I-NEXT:    j .LBB4_2
 ; RV32I-NEXT:  .LBB4_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB4_2 Depth=1
-; RV32I-NEXT:    sb a3, 15(sp)
+; RV32I-NEXT:    sb a0, 15(sp)
 ; RV32I-NEXT:    addi a1, sp, 15
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_1
-; RV32I-NEXT:    lbu a3, 15(sp)
-; RV32I-NEXT:    bnez a0, .LBB4_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lbu a0, 15(sp)
+; RV32I-NEXT:    bnez a1, .LBB4_4
 ; RV32I-NEXT:  .LBB4_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    andi a0, a3, 255
-; RV32I-NEXT:    seqz a1, a0
-; RV32I-NEXT:    sltu a0, s2, a0
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    mv a2, s1
-; RV32I-NEXT:    bnez a0, .LBB4_1
+; RV32I-NEXT:    andi a1, a0, 255
+; RV32I-NEXT:    seqz a2, a1
+; RV32I-NEXT:    sltu a1, s2, a1
+; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    mv a2, s0
+; RV32I-NEXT:    bnez a1, .LBB4_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB4_2 Depth=1
-; RV32I-NEXT:    addi a2, a3, -1
+; RV32I-NEXT:    addi a2, a0, -1
 ; RV32I-NEXT:    j .LBB4_1
 ; RV32I-NEXT:  .LBB4_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -728,9 +726,9 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; RV32IA-NEXT:    slli a3, a0, 3
 ; RV32IA-NEXT:    li a4, 255
 ; RV32IA-NEXT:    andi a0, a3, 24
-; RV32IA-NEXT:    lw a6, 0(a2)
 ; RV32IA-NEXT:    sll a3, a4, a3
 ; RV32IA-NEXT:    not a3, a3
+; RV32IA-NEXT:    lw a6, 0(a2)
 ; RV32IA-NEXT:    andi a4, a1, 255
 ; RV32IA-NEXT:    j .LBB4_2
 ; RV32IA-NEXT:  .LBB4_1: # %atomicrmw.start
@@ -782,35 +780,35 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; RV64I-NEXT:    .cfi_offset s0, -16
 ; RV64I-NEXT:    .cfi_offset s1, -24
 ; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    andi s2, a1, 255
 ; RV64I-NEXT:    j .LBB4_2
 ; RV64I-NEXT:  .LBB4_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB4_2 Depth=1
-; RV64I-NEXT:    sb a3, 15(sp)
+; RV64I-NEXT:    sb a0, 15(sp)
 ; RV64I-NEXT:    addi a1, sp, 15
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_1
-; RV64I-NEXT:    lbu a3, 15(sp)
-; RV64I-NEXT:    bnez a0, .LBB4_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lbu a0, 15(sp)
+; RV64I-NEXT:    bnez a1, .LBB4_4
 ; RV64I-NEXT:  .LBB4_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    andi a0, a3, 255
-; RV64I-NEXT:    seqz a1, a0
-; RV64I-NEXT:    sltu a0, s2, a0
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    mv a2, s1
-; RV64I-NEXT:    bnez a0, .LBB4_1
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    seqz a2, a1
+; RV64I-NEXT:    sltu a1, s2, a1
+; RV64I-NEXT:    or a1, a2, a1
+; RV64I-NEXT:    mv a2, s0
+; RV64I-NEXT:    bnez a1, .LBB4_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB4_2 Depth=1
-; RV64I-NEXT:    addi a2, a3, -1
+; RV64I-NEXT:    addi a2, a0, -1
 ; RV64I-NEXT:    j .LBB4_1
 ; RV64I-NEXT:  .LBB4_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -826,37 +824,37 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; RV64IA-LABEL: atomicrmw_udec_wrap_i8:
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
-; RV64IA-NEXT:    slli a4, a0, 3
-; RV64IA-NEXT:    li a5, 255
-; RV64IA-NEXT:    andi a0, a4, 24
-; RV64IA-NEXT:    lw a3, 0(a2)
-; RV64IA-NEXT:    sllw a4, a5, a4
-; RV64IA-NEXT:    not a4, a4
+; RV64IA-NEXT:    slli a3, a0, 3
+; RV64IA-NEXT:    li a4, 255
+; RV64IA-NEXT:    andi a0, a3, 24
+; RV64IA-NEXT:    sllw a3, a4, a3
+; RV64IA-NEXT:    not a3, a3
+; RV64IA-NEXT:    lw a4, 0(a2)
 ; RV64IA-NEXT:    andi a5, a1, 255
 ; RV64IA-NEXT:    j .LBB4_2
 ; RV64IA-NEXT:  .LBB4_1: # %atomicrmw.start
 ; RV64IA-NEXT:    # in Loop: Header=BB4_2 Depth=1
-; RV64IA-NEXT:    sext.w a6, a3
+; RV64IA-NEXT:    sext.w a6, a4
 ; RV64IA-NEXT:    andi a7, a7, 255
 ; RV64IA-NEXT:    sllw a7, a7, a0
-; RV64IA-NEXT:    and a3, a3, a4
-; RV64IA-NEXT:    or a7, a3, a7
+; RV64IA-NEXT:    and a4, a4, a3
+; RV64IA-NEXT:    or a7, a4, a7
 ; RV64IA-NEXT:  .LBB4_5: # %atomicrmw.start
 ; RV64IA-NEXT:    # Parent Loop BB4_2 Depth=1
 ; RV64IA-NEXT:    # => This Inner Loop Header: Depth=2
-; RV64IA-NEXT:    lr.w.aqrl a3, (a2)
-; RV64IA-NEXT:    bne a3, a6, .LBB4_7
+; RV64IA-NEXT:    lr.w.aqrl a4, (a2)
+; RV64IA-NEXT:    bne a4, a6, .LBB4_7
 ; RV64IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV64IA-NEXT:    # in Loop: Header=BB4_5 Depth=2
 ; RV64IA-NEXT:    sc.w.rl t0, a7, (a2)
 ; RV64IA-NEXT:    bnez t0, .LBB4_5
 ; RV64IA-NEXT:  .LBB4_7: # %atomicrmw.start
 ; RV64IA-NEXT:    # in Loop: Header=BB4_2 Depth=1
-; RV64IA-NEXT:    beq a3, a6, .LBB4_4
+; RV64IA-NEXT:    beq a4, a6, .LBB4_4
 ; RV64IA-NEXT:  .LBB4_2: # %atomicrmw.start
 ; RV64IA-NEXT:    # =>This Loop Header: Depth=1
 ; RV64IA-NEXT:    # Child Loop BB4_5 Depth 2
-; RV64IA-NEXT:    srlw a6, a3, a0
+; RV64IA-NEXT:    srlw a6, a4, a0
 ; RV64IA-NEXT:    andi a7, a6, 255
 ; RV64IA-NEXT:    seqz t0, a7
 ; RV64IA-NEXT:    sltu a7, a5, a7
@@ -868,7 +866,7 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; RV64IA-NEXT:    addi a7, a6, -1
 ; RV64IA-NEXT:    j .LBB4_1
 ; RV64IA-NEXT:  .LBB4_4: # %atomicrmw.end
-; RV64IA-NEXT:    srlw a0, a3, a0
+; RV64IA-NEXT:    srlw a0, a4, a0
 ; RV64IA-NEXT:    ret
   %result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst
   ret i8 %result
@@ -891,35 +889,35 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; RV32I-NEXT:    .cfi_offset s3, -20
 ; RV32I-NEXT:    mv s0, a1
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lhu a1, 0(a0)
+; RV32I-NEXT:    lhu a0, 0(a0)
 ; RV32I-NEXT:    lui s2, 16
 ; RV32I-NEXT:    addi s2, s2, -1
-; RV32I-NEXT:    and s3, s0, s2
+; RV32I-NEXT:    and s3, a1, s2
 ; RV32I-NEXT:    j .LBB5_2
 ; RV32I-NEXT:  .LBB5_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB5_2 Depth=1
-; RV32I-NEXT:    sh a1, 10(sp)
+; RV32I-NEXT:    sh a0, 10(sp)
 ; RV32I-NEXT:    addi a1, sp, 10
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_2
-; RV32I-NEXT:    lh a1, 10(sp)
-; RV32I-NEXT:    bnez a0, .LBB5_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lh a0, 10(sp)
+; RV32I-NEXT:    bnez a1, .LBB5_4
 ; RV32I-NEXT:  .LBB5_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    and a0, a1, s2
-; RV32I-NEXT:    seqz a2, a0
-; RV32I-NEXT:    sltu a0, s3, a0
-; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    and a1, a0, s2
+; RV32I-NEXT:    seqz a2, a1
+; RV32I-NEXT:    sltu a1, s3, a1
+; RV32I-NEXT:    or a1, a2, a1
 ; RV32I-NEXT:    mv a2, s0
-; RV32I-NEXT:    bnez a0, .LBB5_1
+; RV32I-NEXT:    bnez a1, .LBB5_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB5_2 Depth=1
-; RV32I-NEXT:    addi a2, a1, -1
+; RV32I-NEXT:    addi a2, a0, -1
 ; RV32I-NEXT:    j .LBB5_1
 ; RV32I-NEXT:  .LBB5_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -941,9 +939,9 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    addi a3, a3, -1
-; RV32IA-NEXT:    lw a7, 0(a2)
 ; RV32IA-NEXT:    sll a4, a3, a4
 ; RV32IA-NEXT:    not a4, a4
+; RV32IA-NEXT:    lw a7, 0(a2)
 ; RV32IA-NEXT:    and a5, a1, a3
 ; RV32IA-NEXT:    j .LBB5_2
 ; RV32IA-NEXT:  .LBB5_1: # %atomicrmw.start
@@ -999,35 +997,35 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; RV64I-NEXT:    .cfi_offset s3, -40
 ; RV64I-NEXT:    mv s0, a1
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    lhu a1, 0(a0)
+; RV64I-NEXT:    lhu a0, 0(a0)
 ; RV64I-NEXT:    lui s2, 16
 ; RV64I-NEXT:    addiw s2, s2, -1
-; RV64I-NEXT:    and s3, s0, s2
+; RV64I-NEXT:    and s3, a1, s2
 ; RV64I-NEXT:    j .LBB5_2
 ; RV64I-NEXT:  .LBB5_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB5_2 Depth=1
-; RV64I-NEXT:    sh a1, 6(sp)
+; RV64I-NEXT:    sh a0, 6(sp)
 ; RV64I-NEXT:    addi a1, sp, 6
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_2
-; RV64I-NEXT:    lh a1, 6(sp)
-; RV64I-NEXT:    bnez a0, .LBB5_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lh a0, 6(sp)
+; RV64I-NEXT:    bnez a1, .LBB5_4
 ; RV64I-NEXT:  .LBB5_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    and a0, a1, s2
-; RV64I-NEXT:    seqz a2, a0
-; RV64I-NEXT:    sltu a0, s3, a0
-; RV64I-NEXT:    or a0, a2, a0
+; RV64I-NEXT:    and a1, a0, s2
+; RV64I-NEXT:    seqz a2, a1
+; RV64I-NEXT:    sltu a1, s3, a1
+; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    mv a2, s0
-; RV64I-NEXT:    bnez a0, .LBB5_1
+; RV64I-NEXT:    bnez a1, .LBB5_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB5_2 Depth=1
-; RV64I-NEXT:    addi a2, a1, -1
+; RV64I-NEXT:    addi a2, a0, -1
 ; RV64I-NEXT:    j .LBB5_1
 ; RV64I-NEXT:  .LBB5_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -1045,38 +1043,38 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; RV64IA-LABEL: atomicrmw_udec_wrap_i16:
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
-; RV64IA-NEXT:    slli a5, a0, 3
+; RV64IA-NEXT:    slli a4, a0, 3
 ; RV64IA-NEXT:    lui a3, 16
-; RV64IA-NEXT:    andi a0, a5, 24
+; RV64IA-NEXT:    andi a0, a4, 24
 ; RV64IA-NEXT:    addiw a3, a3, -1
-; RV64IA-NEXT:    lw a4, 0(a2)
-; RV64IA-NEXT:    sllw a5, a3, a5
-; RV64IA-NEXT:    not a5, a5
+; RV64IA-NEXT:    sllw a4, a3, a4
+; RV64IA-NEXT:    not a4, a4
+; RV64IA-NEXT:    lw a5, 0(a2)
 ; RV64IA-NEXT:    and a6, a1, a3
 ; RV64IA-NEXT:    j .LBB5_2
 ; RV64IA-NEXT:  .LBB5_1: # %atomicrmw.start
 ; RV64IA-NEXT:    # in Loop: Header=BB5_2 Depth=1
-; RV64IA-NEXT:    sext.w a7, a4
+; RV64IA-NEXT:    sext.w a7, a5
 ; RV64IA-NEXT:    and t0, t0, a3
 ; RV64IA-NEXT:    sllw t0, t0, a0
-; RV64IA-NEXT:    and a4, a4, a5
-; RV64IA-NEXT:    or t0, a4, t0
+; RV64IA-NEXT:    and a5, a5, a4
+; RV64IA-NEXT:    or t0, a5, t0
 ; RV64IA-NEXT:  .LBB5_5: # %atomicrmw.start
 ; RV64IA-NEXT:    # Parent Loop BB5_2 Depth=1
 ; RV64IA-NEXT:    # => This Inner Loop Header: Depth=2
-; RV64IA-NEXT:    lr.w.aqrl a4, (a2)
-; RV64IA-NEXT:    bne a4, a7, .LBB5_7
+; RV64IA-NEXT:    lr.w.aqrl a5, (a2)
+; RV64IA-NEXT:    bne a5, a7, .LBB5_7
 ; RV64IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV64IA-NEXT:    # in Loop: Header=BB5_5 Depth=2
 ; RV64IA-NEXT:    sc.w.rl t1, t0, (a2)
 ; RV64IA-NEXT:    bnez t1, .LBB5_5
 ; RV64IA-NEXT:  .LBB5_7: # %atomicrmw.start
 ; RV64IA-NEXT:    # in Loop: Header=BB5_2 Depth=1
-; RV64IA-NEXT:    beq a4, a7, .LBB5_4
+; RV64IA-NEXT:    beq a5, a7, .LBB5_4
 ; RV64IA-NEXT:  .LBB5_2: # %atomicrmw.start
 ; RV64IA-NEXT:    # =>This Loop Header: Depth=1
 ; RV64IA-NEXT:    # Child Loop BB5_5 Depth 2
-; RV64IA-NEXT:    srlw a7, a4, a0
+; RV64IA-NEXT:    srlw a7, a5, a0
 ; RV64IA-NEXT:    and t0, a7, a3
 ; RV64IA-NEXT:    seqz t1, t0
 ; RV64IA-NEXT:    sltu t0, a6, t0
@@ -1088,7 +1086,7 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; RV64IA-NEXT:    addi t0, a7, -1
 ; RV64IA-NEXT:    j .LBB5_1
 ; RV64IA-NEXT:  .LBB5_4: # %atomicrmw.end
-; RV64IA-NEXT:    srlw a0, a4, a0
+; RV64IA-NEXT:    srlw a0, a5, a0
 ; RV64IA-NEXT:    ret
   %result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst
   ret i16 %result
@@ -1105,33 +1103,33 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
 ; RV32I-NEXT:    .cfi_offset ra, -4
 ; RV32I-NEXT:    .cfi_offset s0, -8
 ; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    j .LBB6_2
 ; RV32I-NEXT:  .LBB6_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB6_2 Depth=1
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a0, 0(sp)
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    li a3, 5
 ; RV32I-NEXT:    li a4, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_4
-; RV32I-NEXT:    lw a3, 0(sp)
-; RV32I-NEXT:    bnez a0, .LBB6_4
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    lw a0, 0(sp)
+; RV32I-NEXT:    bnez a1, .LBB6_4
 ; RV32I-NEXT:  .LBB6_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    seqz a0, a3
-; RV32I-NEXT:    sltu a1, s1, a3
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    mv a2, s1
-; RV32I-NEXT:    bnez a0, .LBB6_1
+; RV32I-NEXT:    seqz a1, a0
+; RV32I-NEXT:    sltu a2, s0, a0
+; RV32I-NEXT:    or a1, a1, a2
+; RV32I-NEXT:    mv a2, s0
+; RV32I-NEXT:    bnez a1, .LBB6_1
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB6_2 Depth=1
-; RV32I-NEXT:    addi a2, a3, -1
+; RV32I-NEXT:    addi a2, a0, -1
 ; RV32I-NEXT:    j .LBB6_1
 ; RV32I-NEXT:  .LBB6_4: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1189,34 +1187,34 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
 ; RV64I-NEXT:    .cfi_offset s0, -16
 ; RV64I-NEXT:    .cfi_offset s1, -24
 ; RV64I-NEXT:    .cfi_offset s2, -32
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lw a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    lw a0, 0(a0)
 ; RV64I-NEXT:    sext.w s2, a1
 ; RV64I-NEXT:    j .LBB6_2
 ; RV64I-NEXT:  .LBB6_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB6_2 Depth=1
-; RV64I-NEXT:    sw a3, 12(sp)
+; RV64I-NEXT:    sw a0, 12(sp)
 ; RV64I-NEXT:    addi a1, sp, 12
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_4
-; RV64I-NEXT:    lw a3, 12(sp)
-; RV64I-NEXT:    bnez a0, .LBB6_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    lw a0, 12(sp)
+; RV64I-NEXT:    bnez a1, .LBB6_4
 ; RV64I-NEXT:  .LBB6_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    seqz a0, a3
-; RV64I-NEXT:    sltu a1, s2, a3
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    mv a2, s1
-; RV64I-NEXT:    bnez a0, .LBB6_1
+; RV64I-NEXT:    seqz a1, a0
+; RV64I-NEXT:    sltu a2, s2, a0
+; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    mv a2, s0
+; RV64I-NEXT:    bnez a1, .LBB6_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB6_2 Depth=1
-; RV64I-NEXT:    addiw a2, a3, -1
+; RV64I-NEXT:    addiw a2, a0, -1
 ; RV64I-NEXT:    j .LBB6_1
 ; RV64I-NEXT:  .LBB6_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -1282,49 +1280,48 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a1, 4(s2)
 ; RV32I-NEXT:    j .LBB7_2
 ; RV32I-NEXT:  .LBB7_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
-; RV32I-NEXT:    bnez a0, .LBB7_7
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    lw a0, 8(sp)
+; RV32I-NEXT:    lw a1, 12(sp)
+; RV32I-NEXT:    bnez a2, .LBB7_7
 ; RV32I-NEXT:  .LBB7_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s0, .LBB7_4
+; RV32I-NEXT:    beq a1, s0, .LBB7_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32I-NEXT:    sltu a0, s0, a4
+; RV32I-NEXT:    sltu a2, s0, a1
 ; RV32I-NEXT:    j .LBB7_5
 ; RV32I-NEXT:  .LBB7_4: # in Loop: Header=BB7_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a2, s1, a0
 ; RV32I-NEXT:  .LBB7_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32I-NEXT:    or a1, a5, a4
-; RV32I-NEXT:    seqz a1, a1
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    mv a2, s2
+; RV32I-NEXT:    or a3, a0, a1
+; RV32I-NEXT:    seqz a3, a3
+; RV32I-NEXT:    or a4, a3, a2
+; RV32I-NEXT:    mv a2, s1
 ; RV32I-NEXT:    mv a3, s0
-; RV32I-NEXT:    bnez a0, .LBB7_1
+; RV32I-NEXT:    bnez a4, .LBB7_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32I-NEXT:    seqz a0, a5
-; RV32I-NEXT:    sub a3, a4, a0
-; RV32I-NEXT:    addi a2, a5, -1
+; RV32I-NEXT:    seqz a2, a0
+; RV32I-NEXT:    sub a3, a1, a2
+; RV32I-NEXT:    addi a2, a0, -1
 ; RV32I-NEXT:    j .LBB7_1
 ; RV32I-NEXT:  .LBB7_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -1350,49 +1347,48 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:    .cfi_offset s1, -12
 ; RV32IA-NEXT:    .cfi_offset s2, -16
 ; RV32IA-NEXT:    mv s0, a2
-; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s2, a1
+; RV32IA-NEXT:    mv s1, a1
+; RV32IA-NEXT:    mv s2, a0
+; RV32IA-NEXT:    lw a0, 0(a0)
+; RV32IA-NEXT:    lw a1, 4(s2)
 ; RV32IA-NEXT:    j .LBB7_2
 ; RV32IA-NEXT:  .LBB7_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a0, 8(sp)
+; RV32IA-NEXT:    sw a1, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
-; RV32IA-NEXT:    mv a0, s1
+; RV32IA-NEXT:    mv a0, s2
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
-; RV32IA-NEXT:    bnez a0, .LBB7_7
+; RV32IA-NEXT:    mv a2, a0
+; RV32IA-NEXT:    lw a0, 8(sp)
+; RV32IA-NEXT:    lw a1, 12(sp)
+; RV32IA-NEXT:    bnez a2, .LBB7_7
 ; RV32IA-NEXT:  .LBB7_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s0, .LBB7_4
+; RV32IA-NEXT:    beq a1, s0, .LBB7_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s0, a4
+; RV32IA-NEXT:    sltu a2, s0, a1
 ; RV32IA-NEXT:    j .LBB7_5
 ; RV32IA-NEXT:  .LBB7_4: # in Loop: Header=BB7_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a2, s1, a0
 ; RV32IA-NEXT:  .LBB7_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32IA-NEXT:    or a1, a5, a4
-; RV32IA-NEXT:    seqz a1, a1
-; RV32IA-NEXT:    or a0, a1, a0
-; RV32IA-NEXT:    mv a2, s2
+; RV32IA-NEXT:    or a3, a0, a1
+; RV32IA-NEXT:    seqz a3, a3
+; RV32IA-NEXT:    or a4, a3, a2
+; RV32IA-NEXT:    mv a2, s1
 ; RV32IA-NEXT:    mv a3, s0
-; RV32IA-NEXT:    bnez a0, .LBB7_1
+; RV32IA-NEXT:    bnez a4, .LBB7_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32IA-NEXT:    seqz a0, a5
-; RV32IA-NEXT:    sub a3, a4, a0
-; RV32IA-NEXT:    addi a2, a5, -1
+; RV32IA-NEXT:    seqz a2, a0
+; RV32IA-NEXT:    sub a3, a1, a2
+; RV32IA-NEXT:    addi a2, a0, -1
 ; RV32IA-NEXT:    j .LBB7_1
 ; RV32IA-NEXT:  .LBB7_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -1415,33 +1411,33 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; RV64I-NEXT:    .cfi_offset ra, -8
 ; RV64I-NEXT:    .cfi_offset s0, -16
 ; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    mv s1, a1
+; RV64I-NEXT:    mv s0, a1
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    j .LBB7_2
 ; RV64I-NEXT:  .LBB7_1: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    sd a0, 0(sp)
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    li a3, 5
 ; RV64I-NEXT:    li a4, 5
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __atomic_compare_exchange_8
-; RV64I-NEXT:    ld a3, 0(sp)
-; RV64I-NEXT:    bnez a0, .LBB7_4
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    ld a0, 0(sp)
+; RV64I-NEXT:    bnez a1, .LBB7_4
 ; RV64I-NEXT:  .LBB7_2: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    seqz a0, a3
-; RV64I-NEXT:    sltu a1, s1, a3
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    mv a2, s1
-; RV64I-NEXT:    bnez a0, .LBB7_1
+; RV64I-NEXT:    seqz a1, a0
+; RV64I-NEXT:    sltu a2, s0, a0
+; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    mv a2, s0
+; RV64I-NEXT:    bnez a1, .LBB7_1
 ; RV64I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64I-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV64I-NEXT:    addi a2, a3, -1
+; RV64I-NEXT:    addi a2, a0, -1
 ; RV64I-NEXT:    j .LBB7_1
 ; RV64I-NEXT:  .LBB7_4: # %atomicrmw.end
-; RV64I-NEXT:    mv a0, a3
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/bf16-promote.ll b/llvm/test/CodeGen/RISCV/bf16-promote.ll
index 08c053fab4f67..b3f04975d04c4 100644
--- a/llvm/test/CodeGen/RISCV/bf16-promote.ll
+++ b/llvm/test/CodeGen/RISCV/bf16-promote.ll
@@ -111,12 +111,12 @@ define void @test_fadd(ptr %p, ptr %q) nounwind {
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    mv s0, a0
-; RV64-NEXT:    lhu a0, 0(a1)
-; RV64-NEXT:    lhu a1, 0(s0)
-; RV64-NEXT:    slli a0, a0, 16
+; RV64-NEXT:    lhu a0, 0(a0)
+; RV64-NEXT:    lhu a1, 0(a1)
 ; RV64-NEXT:    slli a1, a1, 16
-; RV64-NEXT:    fmv.w.x fa5, a0
-; RV64-NEXT:    fmv.w.x fa4, a1
+; RV64-NEXT:    slli a0, a0, 16
+; RV64-NEXT:    fmv.w.x fa5, a1
+; RV64-NEXT:    fmv.w.x fa4, a0
 ; RV64-NEXT:    fadd.s fa0, fa4, fa5
 ; RV64-NEXT:    call __truncsfbf2
 ; RV64-NEXT:    fmv.x.w a0, fa0
@@ -132,12 +132,12 @@ define void @test_fadd(ptr %p, ptr %q) nounwind {
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lhu a0, 0(a1)
-; RV32-NEXT:    lhu a1, 0(s0)
-; RV32-NEXT:    slli a0, a0, 16
+; RV32-NEXT:    lhu a0, 0(a0)
+; RV32-NEXT:    lhu a1, 0(a1)
 ; RV32-NEXT:    slli a1, a1, 16
-; RV32-NEXT:    fmv.w.x fa5, a0
-; RV32-NEXT:    fmv.w.x fa4, a1
+; RV32-NEXT:    slli a0, a0, 16
+; RV32-NEXT:    fmv.w.x fa5, a1
+; RV32-NEXT:    fmv.w.x fa4, a0
 ; RV32-NEXT:    fadd.s fa0, fa4, fa5
 ; RV32-NEXT:    call __truncsfbf2
 ; RV32-NEXT:    fmv.x.w a0, fa0
diff --git a/llvm/test/CodeGen/RISCV/bfloat-convert.ll b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
index 82359769c7c22..8621b3e980a04 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-convert.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
@@ -51,13 +51,13 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind {
 ; CHECK32ZFBFMIN-LABEL: fcvt_si_bf16_sat:
 ; CHECK32ZFBFMIN:       # %bb.0: # %start
 ; CHECK32ZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
-; CHECK32ZFBFMIN-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK32ZFBFMIN-NEXT:    feq.s a1, fa5, fa5
-; CHECK32ZFBFMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
 ; CHECK32ZFBFMIN-NEXT:    lui a0, 815104
-; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa3, a0
-; CHECK32ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa3
-; CHECK32ZFBFMIN-NEXT:    neg a0, a1
+; CHECK32ZFBFMIN-NEXT:    lui a1, %hi(.LCPI1_0)
+; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa4, a0
+; CHECK32ZFBFMIN-NEXT:    feq.s a0, fa5, fa5
+; CHECK32ZFBFMIN-NEXT:    neg a0, a0
+; CHECK32ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa4
+; CHECK32ZFBFMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
 ; CHECK32ZFBFMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32ZFBFMIN-NEXT:    fcvt.w.s a1, fa5, rtz
 ; CHECK32ZFBFMIN-NEXT:    and a0, a0, a1
@@ -70,11 +70,11 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind {
 ; RV32ID-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-NEXT:    lui a1, %hi(.LCPI1_0)
 ; RV32ID-NEXT:    slli a0, a0, 16
-; RV32ID-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV32ID-NEXT:    fmv.w.x fa3, a0
-; RV32ID-NEXT:    feq.s a0, fa3, fa3
-; RV32ID-NEXT:    fmax.s fa5, fa3, fa5
+; RV32ID-NEXT:    fmv.w.x fa4, a0
+; RV32ID-NEXT:    feq.s a0, fa4, fa4
+; RV32ID-NEXT:    fmax.s fa5, fa4, fa5
 ; RV32ID-NEXT:    neg a0, a0
+; RV32ID-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
 ; RV32ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32ID-NEXT:    and a0, a0, a1
@@ -83,13 +83,13 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind {
 ; CHECK64ZFBFMIN-LABEL: fcvt_si_bf16_sat:
 ; CHECK64ZFBFMIN:       # %bb.0: # %start
 ; CHECK64ZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
-; CHECK64ZFBFMIN-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK64ZFBFMIN-NEXT:    feq.s a1, fa5, fa5
-; CHECK64ZFBFMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
 ; CHECK64ZFBFMIN-NEXT:    lui a0, 815104
-; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa3, a0
-; CHECK64ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa3
-; CHECK64ZFBFMIN-NEXT:    neg a0, a1
+; CHECK64ZFBFMIN-NEXT:    lui a1, %hi(.LCPI1_0)
+; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa4, a0
+; CHECK64ZFBFMIN-NEXT:    feq.s a0, fa5, fa5
+; CHECK64ZFBFMIN-NEXT:    neg a0, a0
+; CHECK64ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa4
+; CHECK64ZFBFMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
 ; CHECK64ZFBFMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64ZFBFMIN-NEXT:    fcvt.l.s a1, fa5, rtz
 ; CHECK64ZFBFMIN-NEXT:    and a0, a0, a1
@@ -102,11 +102,11 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind {
 ; RV64ID-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-NEXT:    lui a1, %hi(.LCPI1_0)
 ; RV64ID-NEXT:    slli a0, a0, 16
-; RV64ID-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV64ID-NEXT:    fmv.w.x fa3, a0
-; RV64ID-NEXT:    feq.s a0, fa3, fa3
-; RV64ID-NEXT:    fmax.s fa5, fa3, fa5
+; RV64ID-NEXT:    fmv.w.x fa4, a0
+; RV64ID-NEXT:    feq.s a0, fa4, fa4
+; RV64ID-NEXT:    fmax.s fa5, fa4, fa5
 ; RV64ID-NEXT:    neg a0, a0
+; RV64ID-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
 ; RV64ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64ID-NEXT:    and a0, a0, a1
@@ -152,49 +152,49 @@ define i16 @fcvt_ui_bf16(bfloat %a) nounwind {
 define i16 @fcvt_ui_bf16_sat(bfloat %a) nounwind {
 ; CHECK32ZFBFMIN-LABEL: fcvt_ui_bf16_sat:
 ; CHECK32ZFBFMIN:       # %bb.0: # %start
+; CHECK32ZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
+; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa4, zero
 ; CHECK32ZFBFMIN-NEXT:    lui a0, %hi(.LCPI3_0)
-; CHECK32ZFBFMIN-NEXT:    flw fa5, %lo(.LCPI3_0)(a0)
-; CHECK32ZFBFMIN-NEXT:    fcvt.s.bf16 fa4, fa0
-; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa3, zero
-; CHECK32ZFBFMIN-NEXT:    fmax.s fa4, fa4, fa3
-; CHECK32ZFBFMIN-NEXT:    fmin.s fa5, fa4, fa5
+; CHECK32ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa4
+; CHECK32ZFBFMIN-NEXT:    flw fa4, %lo(.LCPI3_0)(a0)
+; CHECK32ZFBFMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32ZFBFMIN-NEXT:    fcvt.wu.s a0, fa5, rtz
 ; CHECK32ZFBFMIN-NEXT:    ret
 ;
 ; RV32ID-LABEL: fcvt_ui_bf16_sat:
 ; RV32ID:       # %bb.0: # %start
-; RV32ID-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV32ID-NEXT:    flw fa5, %lo(.LCPI3_0)(a0)
 ; RV32ID-NEXT:    fmv.x.w a0, fa0
+; RV32ID-NEXT:    fmv.w.x fa5, zero
 ; RV32ID-NEXT:    slli a0, a0, 16
 ; RV32ID-NEXT:    fmv.w.x fa4, a0
-; RV32ID-NEXT:    fmv.w.x fa3, zero
-; RV32ID-NEXT:    fmax.s fa4, fa4, fa3
-; RV32ID-NEXT:    fmin.s fa5, fa4, fa5
+; RV32ID-NEXT:    lui a0, %hi(.LCPI3_0)
+; RV32ID-NEXT:    fmax.s fa5, fa4, fa5
+; RV32ID-NEXT:    flw fa4, %lo(.LCPI3_0)(a0)
+; RV32ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-NEXT:    fcvt.wu.s a0, fa5, rtz
 ; RV32ID-NEXT:    ret
 ;
 ; CHECK64ZFBFMIN-LABEL: fcvt_ui_bf16_sat:
 ; CHECK64ZFBFMIN:       # %bb.0: # %start
+; CHECK64ZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
+; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa4, zero
 ; CHECK64ZFBFMIN-NEXT:    lui a0, %hi(.LCPI3_0)
-; CHECK64ZFBFMIN-NEXT:    flw fa5, %lo(.LCPI3_0)(a0)
-; CHECK64ZFBFMIN-NEXT:    fcvt.s.bf16 fa4, fa0
-; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa3, zero
-; CHECK64ZFBFMIN-NEXT:    fmax.s fa4, fa4, fa3
-; CHECK64ZFBFMIN-NEXT:    fmin.s fa5, fa4, fa5
+; CHECK64ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa4
+; CHECK64ZFBFMIN-NEXT:    flw fa4, %lo(.LCPI3_0)(a0)
+; CHECK64ZFBFMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64ZFBFMIN-NEXT:    fcvt.lu.s a0, fa5, rtz
 ; CHECK64ZFBFMIN-NEXT:    ret
 ;
 ; RV64ID-LABEL: fcvt_ui_bf16_sat:
 ; RV64ID:       # %bb.0: # %start
-; RV64ID-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV64ID-NEXT:    flw fa5, %lo(.LCPI3_0)(a0)
 ; RV64ID-NEXT:    fmv.x.w a0, fa0
+; RV64ID-NEXT:    fmv.w.x fa5, zero
 ; RV64ID-NEXT:    slli a0, a0, 16
 ; RV64ID-NEXT:    fmv.w.x fa4, a0
-; RV64ID-NEXT:    fmv.w.x fa3, zero
-; RV64ID-NEXT:    fmax.s fa4, fa4, fa3
-; RV64ID-NEXT:    fmin.s fa5, fa4, fa5
+; RV64ID-NEXT:    lui a0, %hi(.LCPI3_0)
+; RV64ID-NEXT:    fmax.s fa5, fa4, fa5
+; RV64ID-NEXT:    flw fa4, %lo(.LCPI3_0)(a0)
+; RV64ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-NEXT:    fcvt.lu.s a0, fa5, rtz
 ; RV64ID-NEXT:    ret
 start:
@@ -647,14 +647,14 @@ define i64 @fcvt_lu_bf16_sat(bfloat %a) nounwind {
 ; CHECK32ZFBFMIN-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; CHECK32ZFBFMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; CHECK32ZFBFMIN-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; CHECK32ZFBFMIN-NEXT:    fcvt.s.bf16 fa0, fa0
 ; CHECK32ZFBFMIN-NEXT:    lui a0, %hi(.LCPI12_0)
+; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa5, zero
+; CHECK32ZFBFMIN-NEXT:    fle.s a1, fa5, fa0
 ; CHECK32ZFBFMIN-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
-; CHECK32ZFBFMIN-NEXT:    fcvt.s.bf16 fa0, fa0
-; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa4, zero
-; CHECK32ZFBFMIN-NEXT:    fle.s a0, fa4, fa0
-; CHECK32ZFBFMIN-NEXT:    flt.s a1, fa5, fa0
-; CHECK32ZFBFMIN-NEXT:    neg s0, a1
-; CHECK32ZFBFMIN-NEXT:    neg s1, a0
+; CHECK32ZFBFMIN-NEXT:    flt.s a0, fa5, fa0
+; CHECK32ZFBFMIN-NEXT:    neg s0, a0
+; CHECK32ZFBFMIN-NEXT:    neg s1, a1
 ; CHECK32ZFBFMIN-NEXT:    call __fixunssfdi
 ; CHECK32ZFBFMIN-NEXT:    and a0, s1, a0
 ; CHECK32ZFBFMIN-NEXT:    and a1, s1, a1
@@ -675,11 +675,11 @@ define i64 @fcvt_lu_bf16_sat(bfloat %a) nounwind {
 ; RV32ID-NEXT:    fmv.x.w a0, fa0
 ; RV32ID-NEXT:    lui a1, %hi(.LCPI12_0)
 ; RV32ID-NEXT:    fmv.w.x fa5, zero
-; RV32ID-NEXT:    flw fa4, %lo(.LCPI12_0)(a1)
 ; RV32ID-NEXT:    slli a0, a0, 16
 ; RV32ID-NEXT:    fmv.w.x fa0, a0
 ; RV32ID-NEXT:    fle.s a0, fa5, fa0
-; RV32ID-NEXT:    flt.s a1, fa4, fa0
+; RV32ID-NEXT:    flw fa5, %lo(.LCPI12_0)(a1)
+; RV32ID-NEXT:    flt.s a1, fa5, fa0
 ; RV32ID-NEXT:    neg s0, a1
 ; RV32ID-NEXT:    neg s1, a0
 ; RV32ID-NEXT:    call __fixunssfdi
diff --git a/llvm/test/CodeGen/RISCV/bfloat-mem.ll b/llvm/test/CodeGen/RISCV/bfloat-mem.ll
index f9cf4e523b77d..504a698615841 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-mem.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-mem.ll
@@ -7,11 +7,11 @@
 define bfloat @flh(ptr %a) nounwind {
 ; CHECK-LABEL: flh:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    flh fa5, 6(a0)
-; CHECK-NEXT:    flh fa4, 0(a0)
-; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
+; CHECK-NEXT:    flh fa5, 0(a0)
+; CHECK-NEXT:    flh fa4, 6(a0)
 ; CHECK-NEXT:    fcvt.s.bf16 fa4, fa4
-; CHECK-NEXT:    fadd.s fa5, fa4, fa5
+; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
+; CHECK-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-NEXT:    fcvt.bf16.s fa0, fa5
 ; CHECK-NEXT:    ret
   %1 = load bfloat, ptr %a
diff --git a/llvm/test/CodeGen/RISCV/bfloat.ll b/llvm/test/CodeGen/RISCV/bfloat.ll
index c83b0ed6b0eee..1b93fdbbb68c2 100644
--- a/llvm/test/CodeGen/RISCV/bfloat.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat.ll
@@ -447,12 +447,12 @@ define bfloat @bfloat_load(ptr %a) nounwind {
 ; RV32ID-ILP32:       # %bb.0:
 ; RV32ID-ILP32-NEXT:    addi sp, sp, -16
 ; RV32ID-ILP32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32ID-ILP32-NEXT:    lhu a1, 6(a0)
-; RV32ID-ILP32-NEXT:    lhu a0, 0(a0)
-; RV32ID-ILP32-NEXT:    slli a1, a1, 16
+; RV32ID-ILP32-NEXT:    lhu a1, 0(a0)
+; RV32ID-ILP32-NEXT:    lhu a0, 6(a0)
 ; RV32ID-ILP32-NEXT:    slli a0, a0, 16
-; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a1
-; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a0
+; RV32ID-ILP32-NEXT:    slli a1, a1, 16
+; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a0
+; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a1
 ; RV32ID-ILP32-NEXT:    fadd.s fa5, fa4, fa5
 ; RV32ID-ILP32-NEXT:    fmv.x.w a0, fa5
 ; RV32ID-ILP32-NEXT:    call __truncsfbf2
@@ -466,12 +466,12 @@ define bfloat @bfloat_load(ptr %a) nounwind {
 ; RV64ID-LP64:       # %bb.0:
 ; RV64ID-LP64-NEXT:    addi sp, sp, -16
 ; RV64ID-LP64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64ID-LP64-NEXT:    lhu a1, 6(a0)
-; RV64ID-LP64-NEXT:    lhu a0, 0(a0)
-; RV64ID-LP64-NEXT:    slli a1, a1, 16
+; RV64ID-LP64-NEXT:    lhu a1, 0(a0)
+; RV64ID-LP64-NEXT:    lhu a0, 6(a0)
 ; RV64ID-LP64-NEXT:    slli a0, a0, 16
-; RV64ID-LP64-NEXT:    fmv.w.x fa5, a1
-; RV64ID-LP64-NEXT:    fmv.w.x fa4, a0
+; RV64ID-LP64-NEXT:    slli a1, a1, 16
+; RV64ID-LP64-NEXT:    fmv.w.x fa5, a0
+; RV64ID-LP64-NEXT:    fmv.w.x fa4, a1
 ; RV64ID-LP64-NEXT:    fadd.s fa5, fa4, fa5
 ; RV64ID-LP64-NEXT:    fmv.x.w a0, fa5
 ; RV64ID-LP64-NEXT:    call __truncsfbf2
@@ -485,12 +485,12 @@ define bfloat @bfloat_load(ptr %a) nounwind {
 ; RV32ID-ILP32D:       # %bb.0:
 ; RV32ID-ILP32D-NEXT:    addi sp, sp, -16
 ; RV32ID-ILP32D-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32ID-ILP32D-NEXT:    lhu a1, 6(a0)
-; RV32ID-ILP32D-NEXT:    lhu a0, 0(a0)
-; RV32ID-ILP32D-NEXT:    slli a1, a1, 16
+; RV32ID-ILP32D-NEXT:    lhu a1, 0(a0)
+; RV32ID-ILP32D-NEXT:    lhu a0, 6(a0)
 ; RV32ID-ILP32D-NEXT:    slli a0, a0, 16
-; RV32ID-ILP32D-NEXT:    fmv.w.x fa5, a1
-; RV32ID-ILP32D-NEXT:    fmv.w.x fa4, a0
+; RV32ID-ILP32D-NEXT:    slli a1, a1, 16
+; RV32ID-ILP32D-NEXT:    fmv.w.x fa5, a0
+; RV32ID-ILP32D-NEXT:    fmv.w.x fa4, a1
 ; RV32ID-ILP32D-NEXT:    fadd.s fa0, fa4, fa5
 ; RV32ID-ILP32D-NEXT:    call __truncsfbf2
 ; RV32ID-ILP32D-NEXT:    fmv.x.w a0, fa0
@@ -505,12 +505,12 @@ define bfloat @bfloat_load(ptr %a) nounwind {
 ; RV64ID-LP64D:       # %bb.0:
 ; RV64ID-LP64D-NEXT:    addi sp, sp, -16
 ; RV64ID-LP64D-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64ID-LP64D-NEXT:    lhu a1, 6(a0)
-; RV64ID-LP64D-NEXT:    lhu a0, 0(a0)
-; RV64ID-LP64D-NEXT:    slli a1, a1, 16
+; RV64ID-LP64D-NEXT:    lhu a1, 0(a0)
+; RV64ID-LP64D-NEXT:    lhu a0, 6(a0)
 ; RV64ID-LP64D-NEXT:    slli a0, a0, 16
-; RV64ID-LP64D-NEXT:    fmv.w.x fa5, a1
-; RV64ID-LP64D-NEXT:    fmv.w.x fa4, a0
+; RV64ID-LP64D-NEXT:    slli a1, a1, 16
+; RV64ID-LP64D-NEXT:    fmv.w.x fa5, a0
+; RV64ID-LP64D-NEXT:    fmv.w.x fa4, a1
 ; RV64ID-LP64D-NEXT:    fadd.s fa0, fa4, fa5
 ; RV64ID-LP64D-NEXT:    call __truncsfbf2
 ; RV64ID-LP64D-NEXT:    fmv.x.w a0, fa0
diff --git a/llvm/test/CodeGen/RISCV/bittest.ll b/llvm/test/CodeGen/RISCV/bittest.ll
index d69ab0550a034..0564764c3f0bc 100644
--- a/llvm/test/CodeGen/RISCV/bittest.ll
+++ b/llvm/test/CodeGen/RISCV/bittest.ll
@@ -552,12 +552,12 @@ declare void @bar()
 define signext i32 @bit_10_z_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; CHECK-LABEL: bit_10_z_select_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andi a3, a0, 1024
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:    beqz a3, .LBB15_2
+; CHECK-NEXT:    andi a0, a0, 1024
+; CHECK-NEXT:    beqz a0, .LBB15_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB15_2:
+; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:    ret
   %1 = and i32 %a, 1024
   %2 = icmp eq i32 %1, 0
@@ -568,22 +568,22 @@ define signext i32 @bit_10_z_select_i32(i32 signext %a, i32 signext %b, i32 sign
 define signext i32 @bit_10_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; RV32-LABEL: bit_10_nz_select_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a3, a0, 21
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:    bltz a3, .LBB16_2
+; RV32-NEXT:    slli a0, a0, 21
+; RV32-NEXT:    bltz a0, .LBB16_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:  .LBB16_2:
+; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_10_nz_select_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 53
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bltz a3, .LBB16_2
+; RV64-NEXT:    slli a0, a0, 53
+; RV64-NEXT:    bltz a0, .LBB16_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB16_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i32 %a, 1024
   %2 = icmp ne i32 %1, 0
@@ -594,22 +594,22 @@ define signext i32 @bit_10_nz_select_i32(i32 signext %a, i32 signext %b, i32 sig
 define signext i32 @bit_11_z_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; RV32-LABEL: bit_11_z_select_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a3, a0, 20
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:    bgez a3, .LBB17_2
+; RV32-NEXT:    slli a0, a0, 20
+; RV32-NEXT:    bgez a0, .LBB17_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:  .LBB17_2:
+; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_11_z_select_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 52
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bgez a3, .LBB17_2
+; RV64-NEXT:    slli a0, a0, 52
+; RV64-NEXT:    bgez a0, .LBB17_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB17_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i32 %a, 2048
   %2 = icmp eq i32 %1, 0
@@ -620,22 +620,22 @@ define signext i32 @bit_11_z_select_i32(i32 signext %a, i32 signext %b, i32 sign
 define signext i32 @bit_11_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; RV32-LABEL: bit_11_nz_select_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a3, a0, 20
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:    bltz a3, .LBB18_2
+; RV32-NEXT:    slli a0, a0, 20
+; RV32-NEXT:    bltz a0, .LBB18_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:  .LBB18_2:
+; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_11_nz_select_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 52
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bltz a3, .LBB18_2
+; RV64-NEXT:    slli a0, a0, 52
+; RV64-NEXT:    bltz a0, .LBB18_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB18_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i32 %a, 2048
   %2 = icmp ne i32 %1, 0
@@ -646,22 +646,22 @@ define signext i32 @bit_11_nz_select_i32(i32 signext %a, i32 signext %b, i32 sig
 define signext i32 @bit_20_z_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; RV32-LABEL: bit_20_z_select_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a3, a0, 11
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:    bgez a3, .LBB19_2
+; RV32-NEXT:    slli a0, a0, 11
+; RV32-NEXT:    bgez a0, .LBB19_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:  .LBB19_2:
+; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_20_z_select_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 43
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bgez a3, .LBB19_2
+; RV64-NEXT:    slli a0, a0, 43
+; RV64-NEXT:    bgez a0, .LBB19_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB19_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i32 %a, 1048576
   %2 = icmp eq i32 %1, 0
@@ -672,22 +672,22 @@ define signext i32 @bit_20_z_select_i32(i32 signext %a, i32 signext %b, i32 sign
 define signext i32 @bit_20_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; RV32-LABEL: bit_20_nz_select_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a3, a0, 11
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:    bltz a3, .LBB20_2
+; RV32-NEXT:    slli a0, a0, 11
+; RV32-NEXT:    bltz a0, .LBB20_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:  .LBB20_2:
+; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_20_nz_select_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 43
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bltz a3, .LBB20_2
+; RV64-NEXT:    slli a0, a0, 43
+; RV64-NEXT:    bltz a0, .LBB20_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB20_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i32 %a, 1048576
   %2 = icmp ne i32 %1, 0
@@ -708,12 +708,12 @@ define signext i32 @bit_31_z_select_i32(i32 signext %a, i32 signext %b, i32 sign
 ; RV64-LABEL: bit_31_z_select_i32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a3, 524288
-; RV64-NEXT:    and a3, a0, a3
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    beqz a3, .LBB21_2
+; RV64-NEXT:    and a0, a0, a3
+; RV64-NEXT:    beqz a0, .LBB21_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB21_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i32 %a, 2147483648
   %2 = icmp eq i32 %1, 0
@@ -724,23 +724,23 @@ define signext i32 @bit_31_z_select_i32(i32 signext %a, i32 signext %b, i32 sign
 define signext i32 @bit_31_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; RV32-LABEL: bit_31_nz_select_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    srli a3, a0, 31
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:    bnez a3, .LBB22_2
+; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    bnez a0, .LBB22_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:  .LBB22_2:
+; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_31_nz_select_i32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a3, 524288
-; RV64-NEXT:    and a3, a0, a3
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bnez a3, .LBB22_2
+; RV64-NEXT:    and a0, a0, a3
+; RV64-NEXT:    bnez a0, .LBB22_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB22_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i32 %a, 2147483648
   %2 = icmp ne i32 %1, 0
@@ -752,23 +752,23 @@ define i64 @bit_10_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_10_z_select_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    mv a1, a3
-; RV32-NEXT:    andi a3, a0, 1024
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a3, .LBB23_2
+; RV32-NEXT:    andi a0, a0, 1024
+; RV32-NEXT:    beqz a0, .LBB23_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB23_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_10_z_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    andi a3, a0, 1024
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    beqz a3, .LBB23_2
+; RV64-NEXT:    andi a0, a0, 1024
+; RV64-NEXT:    beqz a0, .LBB23_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB23_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 1024
   %2 = icmp eq i64 %1, 0
@@ -781,47 +781,47 @@ define i64 @bit_10_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    slli a0, a0, 21
-; RV32I-NEXT:    srli a3, a0, 31
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a3, .LBB24_2
+; RV32I-NEXT:    srli a0, a0, 31
+; RV32I-NEXT:    bnez a0, .LBB24_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a2, a4
 ; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:  .LBB24_2:
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64-LABEL: bit_10_nz_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 53
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bltz a3, .LBB24_2
+; RV64-NEXT:    slli a0, a0, 53
+; RV64-NEXT:    bltz a0, .LBB24_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB24_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBS-LABEL: bit_10_nz_select_i64:
 ; RV32ZBS:       # %bb.0:
 ; RV32ZBS-NEXT:    mv a1, a3
-; RV32ZBS-NEXT:    bexti a3, a0, 10
-; RV32ZBS-NEXT:    mv a0, a2
-; RV32ZBS-NEXT:    bnez a3, .LBB24_2
+; RV32ZBS-NEXT:    bexti a0, a0, 10
+; RV32ZBS-NEXT:    bnez a0, .LBB24_2
 ; RV32ZBS-NEXT:  # %bb.1:
-; RV32ZBS-NEXT:    mv a0, a4
+; RV32ZBS-NEXT:    mv a2, a4
 ; RV32ZBS-NEXT:    mv a1, a5
 ; RV32ZBS-NEXT:  .LBB24_2:
+; RV32ZBS-NEXT:    mv a0, a2
 ; RV32ZBS-NEXT:    ret
 ;
 ; RV32XTHEADBS-LABEL: bit_10_nz_select_i64:
 ; RV32XTHEADBS:       # %bb.0:
 ; RV32XTHEADBS-NEXT:    mv a1, a3
-; RV32XTHEADBS-NEXT:    th.tst a3, a0, 10
-; RV32XTHEADBS-NEXT:    mv a0, a2
-; RV32XTHEADBS-NEXT:    bnez a3, .LBB24_2
+; RV32XTHEADBS-NEXT:    th.tst a0, a0, 10
+; RV32XTHEADBS-NEXT:    bnez a0, .LBB24_2
 ; RV32XTHEADBS-NEXT:  # %bb.1:
-; RV32XTHEADBS-NEXT:    mv a0, a4
+; RV32XTHEADBS-NEXT:    mv a2, a4
 ; RV32XTHEADBS-NEXT:    mv a1, a5
 ; RV32XTHEADBS-NEXT:  .LBB24_2:
+; RV32XTHEADBS-NEXT:    mv a0, a2
 ; RV32XTHEADBS-NEXT:    ret
   %1 = and i64 %a, 1024
   %2 = icmp ne i64 %1, 0
@@ -833,23 +833,23 @@ define i64 @bit_11_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_11_z_select_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    mv a1, a3
-; RV32-NEXT:    slli a3, a0, 20
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bgez a3, .LBB25_2
+; RV32-NEXT:    slli a0, a0, 20
+; RV32-NEXT:    bgez a0, .LBB25_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB25_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_11_z_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 52
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bgez a3, .LBB25_2
+; RV64-NEXT:    slli a0, a0, 52
+; RV64-NEXT:    bgez a0, .LBB25_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB25_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 2048
   %2 = icmp eq i64 %1, 0
@@ -862,47 +862,47 @@ define i64 @bit_11_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    slli a0, a0, 20
-; RV32I-NEXT:    srli a3, a0, 31
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a3, .LBB26_2
+; RV32I-NEXT:    srli a0, a0, 31
+; RV32I-NEXT:    bnez a0, .LBB26_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a2, a4
 ; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:  .LBB26_2:
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64-LABEL: bit_11_nz_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 52
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bltz a3, .LBB26_2
+; RV64-NEXT:    slli a0, a0, 52
+; RV64-NEXT:    bltz a0, .LBB26_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB26_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBS-LABEL: bit_11_nz_select_i64:
 ; RV32ZBS:       # %bb.0:
 ; RV32ZBS-NEXT:    mv a1, a3
-; RV32ZBS-NEXT:    bexti a3, a0, 11
-; RV32ZBS-NEXT:    mv a0, a2
-; RV32ZBS-NEXT:    bnez a3, .LBB26_2
+; RV32ZBS-NEXT:    bexti a0, a0, 11
+; RV32ZBS-NEXT:    bnez a0, .LBB26_2
 ; RV32ZBS-NEXT:  # %bb.1:
-; RV32ZBS-NEXT:    mv a0, a4
+; RV32ZBS-NEXT:    mv a2, a4
 ; RV32ZBS-NEXT:    mv a1, a5
 ; RV32ZBS-NEXT:  .LBB26_2:
+; RV32ZBS-NEXT:    mv a0, a2
 ; RV32ZBS-NEXT:    ret
 ;
 ; RV32XTHEADBS-LABEL: bit_11_nz_select_i64:
 ; RV32XTHEADBS:       # %bb.0:
 ; RV32XTHEADBS-NEXT:    mv a1, a3
-; RV32XTHEADBS-NEXT:    th.tst a3, a0, 11
-; RV32XTHEADBS-NEXT:    mv a0, a2
-; RV32XTHEADBS-NEXT:    bnez a3, .LBB26_2
+; RV32XTHEADBS-NEXT:    th.tst a0, a0, 11
+; RV32XTHEADBS-NEXT:    bnez a0, .LBB26_2
 ; RV32XTHEADBS-NEXT:  # %bb.1:
-; RV32XTHEADBS-NEXT:    mv a0, a4
+; RV32XTHEADBS-NEXT:    mv a2, a4
 ; RV32XTHEADBS-NEXT:    mv a1, a5
 ; RV32XTHEADBS-NEXT:  .LBB26_2:
+; RV32XTHEADBS-NEXT:    mv a0, a2
 ; RV32XTHEADBS-NEXT:    ret
   %1 = and i64 %a, 2048
   %2 = icmp ne i64 %1, 0
@@ -914,23 +914,23 @@ define i64 @bit_20_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_20_z_select_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    mv a1, a3
-; RV32-NEXT:    slli a3, a0, 11
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bgez a3, .LBB27_2
+; RV32-NEXT:    slli a0, a0, 11
+; RV32-NEXT:    bgez a0, .LBB27_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB27_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_20_z_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 43
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bgez a3, .LBB27_2
+; RV64-NEXT:    slli a0, a0, 43
+; RV64-NEXT:    bgez a0, .LBB27_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB27_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 1048576
   %2 = icmp eq i64 %1, 0
@@ -943,47 +943,47 @@ define i64 @bit_20_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    slli a0, a0, 11
-; RV32I-NEXT:    srli a3, a0, 31
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a3, .LBB28_2
+; RV32I-NEXT:    srli a0, a0, 31
+; RV32I-NEXT:    bnez a0, .LBB28_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a2, a4
 ; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:  .LBB28_2:
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64-LABEL: bit_20_nz_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 43
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bltz a3, .LBB28_2
+; RV64-NEXT:    slli a0, a0, 43
+; RV64-NEXT:    bltz a0, .LBB28_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB28_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBS-LABEL: bit_20_nz_select_i64:
 ; RV32ZBS:       # %bb.0:
 ; RV32ZBS-NEXT:    mv a1, a3
-; RV32ZBS-NEXT:    bexti a3, a0, 20
-; RV32ZBS-NEXT:    mv a0, a2
-; RV32ZBS-NEXT:    bnez a3, .LBB28_2
+; RV32ZBS-NEXT:    bexti a0, a0, 20
+; RV32ZBS-NEXT:    bnez a0, .LBB28_2
 ; RV32ZBS-NEXT:  # %bb.1:
-; RV32ZBS-NEXT:    mv a0, a4
+; RV32ZBS-NEXT:    mv a2, a4
 ; RV32ZBS-NEXT:    mv a1, a5
 ; RV32ZBS-NEXT:  .LBB28_2:
+; RV32ZBS-NEXT:    mv a0, a2
 ; RV32ZBS-NEXT:    ret
 ;
 ; RV32XTHEADBS-LABEL: bit_20_nz_select_i64:
 ; RV32XTHEADBS:       # %bb.0:
 ; RV32XTHEADBS-NEXT:    mv a1, a3
-; RV32XTHEADBS-NEXT:    th.tst a3, a0, 20
-; RV32XTHEADBS-NEXT:    mv a0, a2
-; RV32XTHEADBS-NEXT:    bnez a3, .LBB28_2
+; RV32XTHEADBS-NEXT:    th.tst a0, a0, 20
+; RV32XTHEADBS-NEXT:    bnez a0, .LBB28_2
 ; RV32XTHEADBS-NEXT:  # %bb.1:
-; RV32XTHEADBS-NEXT:    mv a0, a4
+; RV32XTHEADBS-NEXT:    mv a2, a4
 ; RV32XTHEADBS-NEXT:    mv a1, a5
 ; RV32XTHEADBS-NEXT:  .LBB28_2:
+; RV32XTHEADBS-NEXT:    mv a0, a2
 ; RV32XTHEADBS-NEXT:    ret
   %1 = and i64 %a, 1048576
   %2 = icmp ne i64 %1, 0
@@ -1005,12 +1005,12 @@ define i64 @bit_31_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV64-LABEL: bit_31_z_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 32
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bgez a3, .LBB29_2
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    bgez a0, .LBB29_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB29_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 2147483648
   %2 = icmp eq i64 %1, 0
@@ -1022,23 +1022,23 @@ define i64 @bit_31_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_31_nz_select_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    mv a1, a3
-; RV32-NEXT:    srli a3, a0, 31
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bnez a3, .LBB30_2
+; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    bnez a0, .LBB30_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB30_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_31_nz_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 32
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bltz a3, .LBB30_2
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    bltz a0, .LBB30_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB30_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 2147483648
   %2 = icmp ne i64 %1, 0
@@ -1049,8 +1049,8 @@ define i64 @bit_31_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_32_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_32_z_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    andi a1, a1, 1
 ; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    andi a1, a1, 1
 ; RV32-NEXT:    beqz a1, .LBB31_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
@@ -1061,12 +1061,12 @@ define i64 @bit_32_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV64-LABEL: bit_32_z_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 31
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bgez a3, .LBB31_2
+; RV64-NEXT:    slli a0, a0, 31
+; RV64-NEXT:    bgez a0, .LBB31_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB31_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 4294967296
   %2 = icmp eq i64 %1, 0
@@ -1077,8 +1077,8 @@ define i64 @bit_32_z_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_32_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_32_nz_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    andi a1, a1, 1
 ; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    andi a1, a1, 1
 ; RV32-NEXT:    bnez a1, .LBB32_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
@@ -1089,12 +1089,12 @@ define i64 @bit_32_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV64-LABEL: bit_32_nz_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 31
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bltz a3, .LBB32_2
+; RV64-NEXT:    slli a0, a0, 31
+; RV64-NEXT:    bltz a0, .LBB32_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB32_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 4294967296
   %2 = icmp ne i64 %1, 0
@@ -1105,8 +1105,8 @@ define i64 @bit_32_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_55_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_55_z_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a1, a1, 8
 ; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    slli a1, a1, 8
 ; RV32-NEXT:    bgez a1, .LBB33_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
@@ -1117,12 +1117,12 @@ define i64 @bit_55_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV64-LABEL: bit_55_z_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 8
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bgez a3, .LBB33_2
+; RV64-NEXT:    slli a0, a0, 8
+; RV64-NEXT:    bgez a0, .LBB33_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB33_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 36028797018963968
   %2 = icmp eq i64 %1, 0
@@ -1133,9 +1133,9 @@ define i64 @bit_55_z_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_55_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32I-LABEL: bit_55_nz_select_i64:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    slli a1, a1, 8
 ; RV32I-NEXT:    srli a1, a1, 31
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    bnez a1, .LBB34_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a4
@@ -1146,18 +1146,18 @@ define i64 @bit_55_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV64-LABEL: bit_55_nz_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 8
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bltz a3, .LBB34_2
+; RV64-NEXT:    slli a0, a0, 8
+; RV64-NEXT:    bltz a0, .LBB34_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB34_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBS-LABEL: bit_55_nz_select_i64:
 ; RV32ZBS:       # %bb.0:
-; RV32ZBS-NEXT:    bexti a1, a1, 23
 ; RV32ZBS-NEXT:    mv a0, a2
+; RV32ZBS-NEXT:    bexti a1, a1, 23
 ; RV32ZBS-NEXT:    bnez a1, .LBB34_2
 ; RV32ZBS-NEXT:  # %bb.1:
 ; RV32ZBS-NEXT:    mv a0, a4
@@ -1168,8 +1168,8 @@ define i64 @bit_55_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV32XTHEADBS-LABEL: bit_55_nz_select_i64:
 ; RV32XTHEADBS:       # %bb.0:
-; RV32XTHEADBS-NEXT:    th.tst a1, a1, 23
 ; RV32XTHEADBS-NEXT:    mv a0, a2
+; RV32XTHEADBS-NEXT:    th.tst a1, a1, 23
 ; RV32XTHEADBS-NEXT:    bnez a1, .LBB34_2
 ; RV32XTHEADBS-NEXT:  # %bb.1:
 ; RV32XTHEADBS-NEXT:    mv a0, a4
@@ -1212,8 +1212,8 @@ define i64 @bit_63_z_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_63_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_63_nz_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    bnez a1, .LBB36_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
@@ -1224,12 +1224,12 @@ define i64 @bit_63_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV64-LABEL: bit_63_nz_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    srli a3, a0, 63
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bnez a3, .LBB36_2
+; RV64-NEXT:    srli a0, a0, 63
+; RV64-NEXT:    bnez a0, .LBB36_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB36_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 9223372036854775808
   %2 = icmp ne i64 %1, 0
@@ -1858,12 +1858,12 @@ define void @bit_63_nz_branch_i64(i64 %0) {
 define signext i32 @bit_10_1_z_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; CHECK-LABEL: bit_10_1_z_select_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andi a3, a0, 1023
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:    beqz a3, .LBB59_2
+; CHECK-NEXT:    andi a0, a0, 1023
+; CHECK-NEXT:    beqz a0, .LBB59_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB59_2:
+; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:    ret
   %1 = and i32 %a, 1023
   %2 = icmp eq i32 %1, 0
@@ -1874,12 +1874,12 @@ define signext i32 @bit_10_1_z_select_i32(i32 signext %a, i32 signext %b, i32 si
 define signext i32 @bit_10_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; CHECK-LABEL: bit_10_1_nz_select_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andi a3, a0, 1023
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:    bnez a3, .LBB60_2
+; CHECK-NEXT:    andi a0, a0, 1023
+; CHECK-NEXT:    bnez a0, .LBB60_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB60_2:
+; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:    ret
   %1 = and i32 %a, 1023
   %2 = icmp ne i32 %1, 0
@@ -1890,12 +1890,12 @@ define signext i32 @bit_10_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 s
 define signext i32 @bit_11_1_z_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; CHECK-LABEL: bit_11_1_z_select_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andi a3, a0, 2047
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:    beqz a3, .LBB61_2
+; CHECK-NEXT:    andi a0, a0, 2047
+; CHECK-NEXT:    beqz a0, .LBB61_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB61_2:
+; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:    ret
   %1 = and i32 %a, 2047
   %2 = icmp eq i32 %1, 0
@@ -1906,12 +1906,12 @@ define signext i32 @bit_11_1_z_select_i32(i32 signext %a, i32 signext %b, i32 si
 define signext i32 @bit_11_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; CHECK-LABEL: bit_11_1_nz_select_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andi a3, a0, 2047
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:    bnez a3, .LBB62_2
+; CHECK-NEXT:    andi a0, a0, 2047
+; CHECK-NEXT:    bnez a0, .LBB62_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB62_2:
+; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:    ret
   %1 = and i32 %a, 2047
   %2 = icmp ne i32 %1, 0
@@ -1922,22 +1922,22 @@ define signext i32 @bit_11_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 s
 define signext i32 @bit_16_1_z_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; RV32-LABEL: bit_16_1_z_select_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a3, a0, 16
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:    beqz a3, .LBB63_2
+; RV32-NEXT:    slli a0, a0, 16
+; RV32-NEXT:    beqz a0, .LBB63_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:  .LBB63_2:
+; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_16_1_z_select_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 48
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    beqz a3, .LBB63_2
+; RV64-NEXT:    slli a0, a0, 48
+; RV64-NEXT:    beqz a0, .LBB63_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB63_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i32 %a, 65535
   %2 = icmp eq i32 %1, 0
@@ -1948,22 +1948,22 @@ define signext i32 @bit_16_1_z_select_i32(i32 signext %a, i32 signext %b, i32 si
 define signext i32 @bit_16_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; RV32-LABEL: bit_16_1_nz_select_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a3, a0, 16
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:    bnez a3, .LBB64_2
+; RV32-NEXT:    slli a0, a0, 16
+; RV32-NEXT:    bnez a0, .LBB64_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:  .LBB64_2:
+; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_16_1_nz_select_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 48
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bnez a3, .LBB64_2
+; RV64-NEXT:    slli a0, a0, 48
+; RV64-NEXT:    bnez a0, .LBB64_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB64_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i32 %a, 65535
   %2 = icmp ne i32 %1, 0
@@ -1974,22 +1974,22 @@ define signext i32 @bit_16_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 s
 define signext i32 @bit_20_1_z_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; RV32-LABEL: bit_20_1_z_select_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a3, a0, 12
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:    beqz a3, .LBB65_2
+; RV32-NEXT:    slli a0, a0, 12
+; RV32-NEXT:    beqz a0, .LBB65_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:  .LBB65_2:
+; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_20_1_z_select_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 44
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    beqz a3, .LBB65_2
+; RV64-NEXT:    slli a0, a0, 44
+; RV64-NEXT:    beqz a0, .LBB65_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB65_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i32 %a, 1048575
   %2 = icmp eq i32 %1, 0
@@ -2000,22 +2000,22 @@ define signext i32 @bit_20_1_z_select_i32(i32 signext %a, i32 signext %b, i32 si
 define signext i32 @bit_20_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; RV32-LABEL: bit_20_1_nz_select_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a3, a0, 12
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:    bnez a3, .LBB66_2
+; RV32-NEXT:    slli a0, a0, 12
+; RV32-NEXT:    bnez a0, .LBB66_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:  .LBB66_2:
+; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_20_1_nz_select_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 44
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bnez a3, .LBB66_2
+; RV64-NEXT:    slli a0, a0, 44
+; RV64-NEXT:    bnez a0, .LBB66_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB66_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i32 %a, 1048575
   %2 = icmp ne i32 %1, 0
@@ -2026,22 +2026,22 @@ define signext i32 @bit_20_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 s
 define signext i32 @bit_31_1_z_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; RV32-LABEL: bit_31_1_z_select_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a3, a0, 1
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:    beqz a3, .LBB67_2
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    beqz a0, .LBB67_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:  .LBB67_2:
+; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_31_1_z_select_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 33
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    beqz a3, .LBB67_2
+; RV64-NEXT:    slli a0, a0, 33
+; RV64-NEXT:    beqz a0, .LBB67_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB67_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i32 %a, 2147483647
   %2 = icmp eq i32 %1, 0
@@ -2052,22 +2052,22 @@ define signext i32 @bit_31_1_z_select_i32(i32 signext %a, i32 signext %b, i32 si
 define signext i32 @bit_31_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
 ; RV32-LABEL: bit_31_1_nz_select_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a3, a0, 1
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:    bnez a3, .LBB68_2
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    bnez a0, .LBB68_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:  .LBB68_2:
+; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_31_1_nz_select_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 33
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bnez a3, .LBB68_2
+; RV64-NEXT:    slli a0, a0, 33
+; RV64-NEXT:    bnez a0, .LBB68_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB68_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i32 %a, 2147483647
   %2 = icmp ne i32 %1, 0
@@ -2109,23 +2109,23 @@ define i64 @bit_10_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_10_1_z_select_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    mv a1, a3
-; RV32-NEXT:    andi a3, a0, 1023
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a3, .LBB71_2
+; RV32-NEXT:    andi a0, a0, 1023
+; RV32-NEXT:    beqz a0, .LBB71_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB71_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_10_1_z_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    andi a3, a0, 1023
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    beqz a3, .LBB71_2
+; RV64-NEXT:    andi a0, a0, 1023
+; RV64-NEXT:    beqz a0, .LBB71_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB71_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 1023
   %2 = icmp eq i64 %1, 0
@@ -2137,23 +2137,23 @@ define i64 @bit_10_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_10_1_nz_select_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    mv a1, a3
-; RV32-NEXT:    andi a3, a0, 1023
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bnez a3, .LBB72_2
+; RV32-NEXT:    andi a0, a0, 1023
+; RV32-NEXT:    bnez a0, .LBB72_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB72_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_10_1_nz_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    andi a3, a0, 1023
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bnez a3, .LBB72_2
+; RV64-NEXT:    andi a0, a0, 1023
+; RV64-NEXT:    bnez a0, .LBB72_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB72_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 1023
   %2 = icmp ne i64 %1, 0
@@ -2165,23 +2165,23 @@ define i64 @bit_11_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_11_1_z_select_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    mv a1, a3
-; RV32-NEXT:    andi a3, a0, 2047
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a3, .LBB73_2
+; RV32-NEXT:    andi a0, a0, 2047
+; RV32-NEXT:    beqz a0, .LBB73_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB73_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_11_1_z_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    andi a3, a0, 2047
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    beqz a3, .LBB73_2
+; RV64-NEXT:    andi a0, a0, 2047
+; RV64-NEXT:    beqz a0, .LBB73_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB73_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 2047
   %2 = icmp eq i64 %1, 0
@@ -2193,23 +2193,23 @@ define i64 @bit_11_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_11_1_nz_select_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    mv a1, a3
-; RV32-NEXT:    andi a3, a0, 2047
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bnez a3, .LBB74_2
+; RV32-NEXT:    andi a0, a0, 2047
+; RV32-NEXT:    bnez a0, .LBB74_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB74_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_11_1_nz_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    andi a3, a0, 2047
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bnez a3, .LBB74_2
+; RV64-NEXT:    andi a0, a0, 2047
+; RV64-NEXT:    bnez a0, .LBB74_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB74_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 2047
   %2 = icmp ne i64 %1, 0
@@ -2221,23 +2221,23 @@ define i64 @bit_16_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_16_1_z_select_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    mv a1, a3
-; RV32-NEXT:    slli a3, a0, 16
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a3, .LBB75_2
+; RV32-NEXT:    slli a0, a0, 16
+; RV32-NEXT:    beqz a0, .LBB75_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB75_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_16_1_z_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 48
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    beqz a3, .LBB75_2
+; RV64-NEXT:    slli a0, a0, 48
+; RV64-NEXT:    beqz a0, .LBB75_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB75_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 65535
   %2 = icmp eq i64 %1, 0
@@ -2259,12 +2259,12 @@ define i64 @bit_16_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV64-LABEL: bit_16_1_nz_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    sext.w a3, a0
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bnez a3, .LBB76_2
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    bnez a0, .LBB76_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB76_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 4294967295
   %2 = icmp ne i64 %1, 0
@@ -2277,23 +2277,23 @@ define i64 @bit_20_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_20_1_z_select_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    mv a1, a3
-; RV32-NEXT:    slli a3, a0, 12
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a3, .LBB77_2
+; RV32-NEXT:    slli a0, a0, 12
+; RV32-NEXT:    beqz a0, .LBB77_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB77_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_20_1_z_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 44
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    beqz a3, .LBB77_2
+; RV64-NEXT:    slli a0, a0, 44
+; RV64-NEXT:    beqz a0, .LBB77_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB77_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 1048575
   %2 = icmp eq i64 %1, 0
@@ -2305,23 +2305,23 @@ define i64 @bit_20_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_20_1_nz_select_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    mv a1, a3
-; RV32-NEXT:    slli a3, a0, 12
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bnez a3, .LBB78_2
+; RV32-NEXT:    slli a0, a0, 12
+; RV32-NEXT:    bnez a0, .LBB78_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB78_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_20_1_nz_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 44
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bnez a3, .LBB78_2
+; RV64-NEXT:    slli a0, a0, 44
+; RV64-NEXT:    bnez a0, .LBB78_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB78_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 1048575
   %2 = icmp ne i64 %1, 0
@@ -2333,23 +2333,23 @@ define i64 @bit_31_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_31_1_z_select_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    mv a1, a3
-; RV32-NEXT:    slli a3, a0, 1
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a3, .LBB79_2
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    beqz a0, .LBB79_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB79_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_31_1_z_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 33
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    beqz a3, .LBB79_2
+; RV64-NEXT:    slli a0, a0, 33
+; RV64-NEXT:    beqz a0, .LBB79_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB79_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 2147483647
   %2 = icmp eq i64 %1, 0
@@ -2361,23 +2361,23 @@ define i64 @bit_31_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_31_1_nz_select_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    mv a1, a3
-; RV32-NEXT:    slli a3, a0, 1
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bnez a3, .LBB80_2
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    bnez a0, .LBB80_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a1, a5
 ; RV32-NEXT:  .LBB80_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_31_1_nz_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 33
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bnez a3, .LBB80_2
+; RV64-NEXT:    slli a0, a0, 33
+; RV64-NEXT:    bnez a0, .LBB80_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB80_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 2147483647
   %2 = icmp ne i64 %1, 0
@@ -2399,12 +2399,12 @@ define i64 @bit_32_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV64-LABEL: bit_32_1_z_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    sext.w a3, a0
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    beqz a3, .LBB81_2
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    beqz a0, .LBB81_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB81_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 4294967295
   %2 = icmp eq i64 %1, 0
@@ -2426,12 +2426,12 @@ define i64 @bit_32_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV64-LABEL: bit_32_1_nz_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    sext.w a3, a0
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bnez a3, .LBB82_2
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    bnez a0, .LBB82_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB82_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 4294967295
   %2 = icmp ne i64 %1, 0
@@ -2444,24 +2444,24 @@ define i64 @bit_55_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a1, a1, 9
 ; RV32-NEXT:    srli a1, a1, 9
-; RV32-NEXT:    or a1, a0, a1
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a1, .LBB83_2
+; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    beqz a0, .LBB83_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:  .LBB83_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    mv a1, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_55_1_z_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 9
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    beqz a3, .LBB83_2
+; RV64-NEXT:    slli a0, a0, 9
+; RV64-NEXT:    beqz a0, .LBB83_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB83_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 36028797018963967
   %2 = icmp eq i64 %1, 0
@@ -2474,24 +2474,24 @@ define i64 @bit_55_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a1, a1, 9
 ; RV32-NEXT:    srli a1, a1, 9
-; RV32-NEXT:    or a1, a0, a1
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bnez a1, .LBB84_2
+; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    bnez a0, .LBB84_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:  .LBB84_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    mv a1, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_55_1_nz_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 9
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bnez a3, .LBB84_2
+; RV64-NEXT:    slli a0, a0, 9
+; RV64-NEXT:    bnez a0, .LBB84_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB84_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %1 = and i64 %a, 36028797018963967
   %2 = icmp ne i64 %1, 0
@@ -2504,36 +2504,36 @@ define i64 @bit_63_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 1
 ; RV32I-NEXT:    srli a1, a1, 1
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    beqz a1, .LBB85_2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    beqz a0, .LBB85_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a2, a4
 ; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB85_2:
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64-LABEL: bit_63_1_z_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 1
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    beqz a3, .LBB85_2
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    beqz a0, .LBB85_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB85_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBS-LABEL: bit_63_1_z_select_i64:
 ; RV32ZBS:       # %bb.0:
 ; RV32ZBS-NEXT:    bclri a1, a1, 31
-; RV32ZBS-NEXT:    or a1, a0, a1
-; RV32ZBS-NEXT:    mv a0, a2
-; RV32ZBS-NEXT:    beqz a1, .LBB85_2
+; RV32ZBS-NEXT:    or a0, a0, a1
+; RV32ZBS-NEXT:    beqz a0, .LBB85_2
 ; RV32ZBS-NEXT:  # %bb.1:
-; RV32ZBS-NEXT:    mv a0, a4
+; RV32ZBS-NEXT:    mv a2, a4
 ; RV32ZBS-NEXT:    mv a3, a5
 ; RV32ZBS-NEXT:  .LBB85_2:
+; RV32ZBS-NEXT:    mv a0, a2
 ; RV32ZBS-NEXT:    mv a1, a3
 ; RV32ZBS-NEXT:    ret
 ;
@@ -2541,13 +2541,13 @@ define i64 @bit_63_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32XTHEADBS:       # %bb.0:
 ; RV32XTHEADBS-NEXT:    slli a1, a1, 1
 ; RV32XTHEADBS-NEXT:    srli a1, a1, 1
-; RV32XTHEADBS-NEXT:    or a1, a0, a1
-; RV32XTHEADBS-NEXT:    mv a0, a2
-; RV32XTHEADBS-NEXT:    beqz a1, .LBB85_2
+; RV32XTHEADBS-NEXT:    or a0, a0, a1
+; RV32XTHEADBS-NEXT:    beqz a0, .LBB85_2
 ; RV32XTHEADBS-NEXT:  # %bb.1:
-; RV32XTHEADBS-NEXT:    mv a0, a4
+; RV32XTHEADBS-NEXT:    mv a2, a4
 ; RV32XTHEADBS-NEXT:    mv a3, a5
 ; RV32XTHEADBS-NEXT:  .LBB85_2:
+; RV32XTHEADBS-NEXT:    mv a0, a2
 ; RV32XTHEADBS-NEXT:    mv a1, a3
 ; RV32XTHEADBS-NEXT:    ret
   %1 = and i64 %a, 9223372036854775807
@@ -2561,36 +2561,36 @@ define i64 @bit_63_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 1
 ; RV32I-NEXT:    srli a1, a1, 1
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a1, .LBB86_2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    bnez a0, .LBB86_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a2, a4
 ; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB86_2:
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64-LABEL: bit_63_1_nz_select_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a3, a0, 1
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bnez a3, .LBB86_2
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    bnez a0, .LBB86_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB86_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBS-LABEL: bit_63_1_nz_select_i64:
 ; RV32ZBS:       # %bb.0:
 ; RV32ZBS-NEXT:    bclri a1, a1, 31
-; RV32ZBS-NEXT:    or a1, a0, a1
-; RV32ZBS-NEXT:    mv a0, a2
-; RV32ZBS-NEXT:    bnez a1, .LBB86_2
+; RV32ZBS-NEXT:    or a0, a0, a1
+; RV32ZBS-NEXT:    bnez a0, .LBB86_2
 ; RV32ZBS-NEXT:  # %bb.1:
-; RV32ZBS-NEXT:    mv a0, a4
+; RV32ZBS-NEXT:    mv a2, a4
 ; RV32ZBS-NEXT:    mv a3, a5
 ; RV32ZBS-NEXT:  .LBB86_2:
+; RV32ZBS-NEXT:    mv a0, a2
 ; RV32ZBS-NEXT:    mv a1, a3
 ; RV32ZBS-NEXT:    ret
 ;
@@ -2598,13 +2598,13 @@ define i64 @bit_63_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32XTHEADBS:       # %bb.0:
 ; RV32XTHEADBS-NEXT:    slli a1, a1, 1
 ; RV32XTHEADBS-NEXT:    srli a1, a1, 1
-; RV32XTHEADBS-NEXT:    or a1, a0, a1
-; RV32XTHEADBS-NEXT:    mv a0, a2
-; RV32XTHEADBS-NEXT:    bnez a1, .LBB86_2
+; RV32XTHEADBS-NEXT:    or a0, a0, a1
+; RV32XTHEADBS-NEXT:    bnez a0, .LBB86_2
 ; RV32XTHEADBS-NEXT:  # %bb.1:
-; RV32XTHEADBS-NEXT:    mv a0, a4
+; RV32XTHEADBS-NEXT:    mv a2, a4
 ; RV32XTHEADBS-NEXT:    mv a3, a5
 ; RV32XTHEADBS-NEXT:  .LBB86_2:
+; RV32XTHEADBS-NEXT:    mv a0, a2
 ; RV32XTHEADBS-NEXT:    mv a1, a3
 ; RV32XTHEADBS-NEXT:    ret
   %1 = and i64 %a, 9223372036854775807
@@ -2616,13 +2616,13 @@ define i64 @bit_63_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_64_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_64_1_z_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    or a1, a0, a1
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a1, .LBB87_2
+; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    beqz a0, .LBB87_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:  .LBB87_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    mv a1, a3
 ; RV32-NEXT:    ret
 ;
@@ -2643,13 +2643,13 @@ define i64 @bit_64_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_64_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_64_1_nz_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    or a1, a0, a1
-; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bnez a1, .LBB88_2
+; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    bnez a0, .LBB88_2
 ; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:  .LBB88_2:
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    mv a1, a3
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/branch-on-zero.ll b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
index 02aeebdeb3775..de325010bb281 100644
--- a/llvm/test/CodeGen/RISCV/branch-on-zero.ll
+++ b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
@@ -76,8 +76,8 @@ define i32 @test_lshr(i32 %v) {
 ; RV32-NEXT:  .LBB2_1: # %for.body
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    andi a2, a0, 1
-; RV32-NEXT:    srli a0, a0, 1
 ; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    srli a0, a0, 1
 ; RV32-NEXT:    bnez a0, .LBB2_1
 ; RV32-NEXT:  .LBB2_2: # %for.end
 ; RV32-NEXT:    mv a0, a1
@@ -92,8 +92,8 @@ define i32 @test_lshr(i32 %v) {
 ; RV64-NEXT:  .LBB2_2: # %for.body
 ; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64-NEXT:    andi a2, a0, 1
-; RV64-NEXT:    srliw a0, a0, 1
 ; RV64-NEXT:    addw a1, a1, a2
+; RV64-NEXT:    srliw a0, a0, 1
 ; RV64-NEXT:    bnez a0, .LBB2_2
 ; RV64-NEXT:  .LBB2_3: # %for.end
 ; RV64-NEXT:    mv a0, a1
@@ -129,9 +129,8 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
 ; RV32-NEXT:    lw a3, 0(a1)
 ; RV32-NEXT:    addi a4, a1, 4
 ; RV32-NEXT:    slli a3, a3, 1
-; RV32-NEXT:    addi a1, a0, 4
 ; RV32-NEXT:    sw a3, 0(a0)
-; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:    addi a0, a0, 4
 ; RV32-NEXT:    mv a1, a4
 ; RV32-NEXT:    bne a4, a2, .LBB3_2
 ; RV32-NEXT:  .LBB3_3: # %while.end
@@ -153,9 +152,8 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
 ; RV64-NEXT:    lw a3, 0(a1)
 ; RV64-NEXT:    addi a4, a1, 4
 ; RV64-NEXT:    slli a3, a3, 1
-; RV64-NEXT:    addi a1, a0, 4
 ; RV64-NEXT:    sw a3, 0(a0)
-; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    addi a0, a0, 4
 ; RV64-NEXT:    mv a1, a4
 ; RV64-NEXT:    bne a4, a2, .LBB3_2
 ; RV64-NEXT:  .LBB3_3: # %while.end
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
index 337e9bc5845f9..88ad8e6930287 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
@@ -53,22 +53,22 @@ define void @callee() nounwind {
 ; ILP32-NEXT:    flw fs1, 84(a1)
 ; ILP32-NEXT:    flw fs2, 88(a1)
 ; ILP32-NEXT:    flw fs3, 92(a1)
-; ILP32-NEXT:    flw fs4, 112(a1)
-; ILP32-NEXT:    flw fs5, 116(a1)
-; ILP32-NEXT:    flw fs6, 120(a1)
-; ILP32-NEXT:    flw fs7, 124(a1)
-; ILP32-NEXT:    flw fs8, 96(a1)
-; ILP32-NEXT:    flw fs9, 100(a1)
-; ILP32-NEXT:    flw fs10, 104(a1)
-; ILP32-NEXT:    flw fs11, 108(a1)
-; ILP32-NEXT:    fsw fs7, 124(a1)
-; ILP32-NEXT:    fsw fs6, 120(a1)
-; ILP32-NEXT:    fsw fs5, 116(a1)
-; ILP32-NEXT:    fsw fs4, 112(a1)
-; ILP32-NEXT:    fsw fs11, 108(a1)
-; ILP32-NEXT:    fsw fs10, 104(a1)
-; ILP32-NEXT:    fsw fs9, 100(a1)
-; ILP32-NEXT:    fsw fs8, 96(a1)
+; ILP32-NEXT:    flw fs4, 96(a1)
+; ILP32-NEXT:    flw fs5, 100(a1)
+; ILP32-NEXT:    flw fs6, 104(a1)
+; ILP32-NEXT:    flw fs7, 108(a1)
+; ILP32-NEXT:    flw fs8, 112(a1)
+; ILP32-NEXT:    flw fs9, 116(a1)
+; ILP32-NEXT:    flw fs10, 120(a1)
+; ILP32-NEXT:    flw fs11, 124(a1)
+; ILP32-NEXT:    fsw fs11, 124(a1)
+; ILP32-NEXT:    fsw fs10, 120(a1)
+; ILP32-NEXT:    fsw fs9, 116(a1)
+; ILP32-NEXT:    fsw fs8, 112(a1)
+; ILP32-NEXT:    fsw fs7, 108(a1)
+; ILP32-NEXT:    fsw fs6, 104(a1)
+; ILP32-NEXT:    fsw fs5, 100(a1)
+; ILP32-NEXT:    fsw fs4, 96(a1)
 ; ILP32-NEXT:    fsw fs3, 92(a1)
 ; ILP32-NEXT:    fsw fs2, 88(a1)
 ; ILP32-NEXT:    fsw fs1, 84(a1)
@@ -123,22 +123,22 @@ define void @callee() nounwind {
 ; ILP32E-NEXT:    flw fs1, 84(a1)
 ; ILP32E-NEXT:    flw fs2, 88(a1)
 ; ILP32E-NEXT:    flw fs3, 92(a1)
-; ILP32E-NEXT:    flw fs4, 112(a1)
-; ILP32E-NEXT:    flw fs5, 116(a1)
-; ILP32E-NEXT:    flw fs6, 120(a1)
-; ILP32E-NEXT:    flw fs7, 124(a1)
-; ILP32E-NEXT:    flw fs8, 96(a1)
-; ILP32E-NEXT:    flw fs9, 100(a1)
-; ILP32E-NEXT:    flw fs10, 104(a1)
-; ILP32E-NEXT:    flw fs11, 108(a1)
-; ILP32E-NEXT:    fsw fs7, 124(a1)
-; ILP32E-NEXT:    fsw fs6, 120(a1)
-; ILP32E-NEXT:    fsw fs5, 116(a1)
-; ILP32E-NEXT:    fsw fs4, 112(a1)
-; ILP32E-NEXT:    fsw fs11, 108(a1)
-; ILP32E-NEXT:    fsw fs10, 104(a1)
-; ILP32E-NEXT:    fsw fs9, 100(a1)
-; ILP32E-NEXT:    fsw fs8, 96(a1)
+; ILP32E-NEXT:    flw fs4, 96(a1)
+; ILP32E-NEXT:    flw fs5, 100(a1)
+; ILP32E-NEXT:    flw fs6, 104(a1)
+; ILP32E-NEXT:    flw fs7, 108(a1)
+; ILP32E-NEXT:    flw fs8, 112(a1)
+; ILP32E-NEXT:    flw fs9, 116(a1)
+; ILP32E-NEXT:    flw fs10, 120(a1)
+; ILP32E-NEXT:    flw fs11, 124(a1)
+; ILP32E-NEXT:    fsw fs11, 124(a1)
+; ILP32E-NEXT:    fsw fs10, 120(a1)
+; ILP32E-NEXT:    fsw fs9, 116(a1)
+; ILP32E-NEXT:    fsw fs8, 112(a1)
+; ILP32E-NEXT:    fsw fs7, 108(a1)
+; ILP32E-NEXT:    fsw fs6, 104(a1)
+; ILP32E-NEXT:    fsw fs5, 100(a1)
+; ILP32E-NEXT:    fsw fs4, 96(a1)
 ; ILP32E-NEXT:    fsw fs3, 92(a1)
 ; ILP32E-NEXT:    fsw fs2, 88(a1)
 ; ILP32E-NEXT:    fsw fs1, 84(a1)
@@ -193,22 +193,22 @@ define void @callee() nounwind {
 ; LP64-NEXT:    flw fs1, 84(a1)
 ; LP64-NEXT:    flw fs2, 88(a1)
 ; LP64-NEXT:    flw fs3, 92(a1)
-; LP64-NEXT:    flw fs4, 112(a1)
-; LP64-NEXT:    flw fs5, 116(a1)
-; LP64-NEXT:    flw fs6, 120(a1)
-; LP64-NEXT:    flw fs7, 124(a1)
-; LP64-NEXT:    flw fs8, 96(a1)
-; LP64-NEXT:    flw fs9, 100(a1)
-; LP64-NEXT:    flw fs10, 104(a1)
-; LP64-NEXT:    flw fs11, 108(a1)
-; LP64-NEXT:    fsw fs7, 124(a1)
-; LP64-NEXT:    fsw fs6, 120(a1)
-; LP64-NEXT:    fsw fs5, 116(a1)
-; LP64-NEXT:    fsw fs4, 112(a1)
-; LP64-NEXT:    fsw fs11, 108(a1)
-; LP64-NEXT:    fsw fs10, 104(a1)
-; LP64-NEXT:    fsw fs9, 100(a1)
-; LP64-NEXT:    fsw fs8, 96(a1)
+; LP64-NEXT:    flw fs4, 96(a1)
+; LP64-NEXT:    flw fs5, 100(a1)
+; LP64-NEXT:    flw fs6, 104(a1)
+; LP64-NEXT:    flw fs7, 108(a1)
+; LP64-NEXT:    flw fs8, 112(a1)
+; LP64-NEXT:    flw fs9, 116(a1)
+; LP64-NEXT:    flw fs10, 120(a1)
+; LP64-NEXT:    flw fs11, 124(a1)
+; LP64-NEXT:    fsw fs11, 124(a1)
+; LP64-NEXT:    fsw fs10, 120(a1)
+; LP64-NEXT:    fsw fs9, 116(a1)
+; LP64-NEXT:    fsw fs8, 112(a1)
+; LP64-NEXT:    fsw fs7, 108(a1)
+; LP64-NEXT:    fsw fs6, 104(a1)
+; LP64-NEXT:    fsw fs5, 100(a1)
+; LP64-NEXT:    fsw fs4, 96(a1)
 ; LP64-NEXT:    fsw fs3, 92(a1)
 ; LP64-NEXT:    fsw fs2, 88(a1)
 ; LP64-NEXT:    fsw fs1, 84(a1)
@@ -263,22 +263,22 @@ define void @callee() nounwind {
 ; LP64E-NEXT:    flw fs1, 84(a1)
 ; LP64E-NEXT:    flw fs2, 88(a1)
 ; LP64E-NEXT:    flw fs3, 92(a1)
-; LP64E-NEXT:    flw fs4, 112(a1)
-; LP64E-NEXT:    flw fs5, 116(a1)
-; LP64E-NEXT:    flw fs6, 120(a1)
-; LP64E-NEXT:    flw fs7, 124(a1)
-; LP64E-NEXT:    flw fs8, 96(a1)
-; LP64E-NEXT:    flw fs9, 100(a1)
-; LP64E-NEXT:    flw fs10, 104(a1)
-; LP64E-NEXT:    flw fs11, 108(a1)
-; LP64E-NEXT:    fsw fs7, 124(a1)
-; LP64E-NEXT:    fsw fs6, 120(a1)
-; LP64E-NEXT:    fsw fs5, 116(a1)
-; LP64E-NEXT:    fsw fs4, 112(a1)
-; LP64E-NEXT:    fsw fs11, 108(a1)
-; LP64E-NEXT:    fsw fs10, 104(a1)
-; LP64E-NEXT:    fsw fs9, 100(a1)
-; LP64E-NEXT:    fsw fs8, 96(a1)
+; LP64E-NEXT:    flw fs4, 96(a1)
+; LP64E-NEXT:    flw fs5, 100(a1)
+; LP64E-NEXT:    flw fs6, 104(a1)
+; LP64E-NEXT:    flw fs7, 108(a1)
+; LP64E-NEXT:    flw fs8, 112(a1)
+; LP64E-NEXT:    flw fs9, 116(a1)
+; LP64E-NEXT:    flw fs10, 120(a1)
+; LP64E-NEXT:    flw fs11, 124(a1)
+; LP64E-NEXT:    fsw fs11, 124(a1)
+; LP64E-NEXT:    fsw fs10, 120(a1)
+; LP64E-NEXT:    fsw fs9, 116(a1)
+; LP64E-NEXT:    fsw fs8, 112(a1)
+; LP64E-NEXT:    fsw fs7, 108(a1)
+; LP64E-NEXT:    fsw fs6, 104(a1)
+; LP64E-NEXT:    fsw fs5, 100(a1)
+; LP64E-NEXT:    fsw fs4, 96(a1)
 ; LP64E-NEXT:    fsw fs3, 92(a1)
 ; LP64E-NEXT:    fsw fs2, 88(a1)
 ; LP64E-NEXT:    fsw fs1, 84(a1)
@@ -346,22 +346,22 @@ define void @callee() nounwind {
 ; ILP32F-NEXT:    flw fs1, 84(a1)
 ; ILP32F-NEXT:    flw fs2, 88(a1)
 ; ILP32F-NEXT:    flw fs3, 92(a1)
-; ILP32F-NEXT:    flw fs4, 112(a1)
-; ILP32F-NEXT:    flw fs5, 116(a1)
-; ILP32F-NEXT:    flw fs6, 120(a1)
-; ILP32F-NEXT:    flw fs7, 124(a1)
-; ILP32F-NEXT:    flw fs8, 96(a1)
-; ILP32F-NEXT:    flw fs9, 100(a1)
-; ILP32F-NEXT:    flw fs10, 104(a1)
-; ILP32F-NEXT:    flw fs11, 108(a1)
-; ILP32F-NEXT:    fsw fs7, 124(a1)
-; ILP32F-NEXT:    fsw fs6, 120(a1)
-; ILP32F-NEXT:    fsw fs5, 116(a1)
-; ILP32F-NEXT:    fsw fs4, 112(a1)
-; ILP32F-NEXT:    fsw fs11, 108(a1)
-; ILP32F-NEXT:    fsw fs10, 104(a1)
-; ILP32F-NEXT:    fsw fs9, 100(a1)
-; ILP32F-NEXT:    fsw fs8, 96(a1)
+; ILP32F-NEXT:    flw fs4, 96(a1)
+; ILP32F-NEXT:    flw fs5, 100(a1)
+; ILP32F-NEXT:    flw fs6, 104(a1)
+; ILP32F-NEXT:    flw fs7, 108(a1)
+; ILP32F-NEXT:    flw fs8, 112(a1)
+; ILP32F-NEXT:    flw fs9, 116(a1)
+; ILP32F-NEXT:    flw fs10, 120(a1)
+; ILP32F-NEXT:    flw fs11, 124(a1)
+; ILP32F-NEXT:    fsw fs11, 124(a1)
+; ILP32F-NEXT:    fsw fs10, 120(a1)
+; ILP32F-NEXT:    fsw fs9, 116(a1)
+; ILP32F-NEXT:    fsw fs8, 112(a1)
+; ILP32F-NEXT:    fsw fs7, 108(a1)
+; ILP32F-NEXT:    fsw fs6, 104(a1)
+; ILP32F-NEXT:    fsw fs5, 100(a1)
+; ILP32F-NEXT:    fsw fs4, 96(a1)
 ; ILP32F-NEXT:    fsw fs3, 92(a1)
 ; ILP32F-NEXT:    fsw fs2, 88(a1)
 ; ILP32F-NEXT:    fsw fs1, 84(a1)
@@ -442,22 +442,22 @@ define void @callee() nounwind {
 ; LP64F-NEXT:    flw fs1, 84(a1)
 ; LP64F-NEXT:    flw fs2, 88(a1)
 ; LP64F-NEXT:    flw fs3, 92(a1)
-; LP64F-NEXT:    flw fs4, 112(a1)
-; LP64F-NEXT:    flw fs5, 116(a1)
-; LP64F-NEXT:    flw fs6, 120(a1)
-; LP64F-NEXT:    flw fs7, 124(a1)
-; LP64F-NEXT:    flw fs8, 96(a1)
-; LP64F-NEXT:    flw fs9, 100(a1)
-; LP64F-NEXT:    flw fs10, 104(a1)
-; LP64F-NEXT:    flw fs11, 108(a1)
-; LP64F-NEXT:    fsw fs7, 124(a1)
-; LP64F-NEXT:    fsw fs6, 120(a1)
-; LP64F-NEXT:    fsw fs5, 116(a1)
-; LP64F-NEXT:    fsw fs4, 112(a1)
-; LP64F-NEXT:    fsw fs11, 108(a1)
-; LP64F-NEXT:    fsw fs10, 104(a1)
-; LP64F-NEXT:    fsw fs9, 100(a1)
-; LP64F-NEXT:    fsw fs8, 96(a1)
+; LP64F-NEXT:    flw fs4, 96(a1)
+; LP64F-NEXT:    flw fs5, 100(a1)
+; LP64F-NEXT:    flw fs6, 104(a1)
+; LP64F-NEXT:    flw fs7, 108(a1)
+; LP64F-NEXT:    flw fs8, 112(a1)
+; LP64F-NEXT:    flw fs9, 116(a1)
+; LP64F-NEXT:    flw fs10, 120(a1)
+; LP64F-NEXT:    flw fs11, 124(a1)
+; LP64F-NEXT:    fsw fs11, 124(a1)
+; LP64F-NEXT:    fsw fs10, 120(a1)
+; LP64F-NEXT:    fsw fs9, 116(a1)
+; LP64F-NEXT:    fsw fs8, 112(a1)
+; LP64F-NEXT:    fsw fs7, 108(a1)
+; LP64F-NEXT:    fsw fs6, 104(a1)
+; LP64F-NEXT:    fsw fs5, 100(a1)
+; LP64F-NEXT:    fsw fs4, 96(a1)
 ; LP64F-NEXT:    fsw fs3, 92(a1)
 ; LP64F-NEXT:    fsw fs2, 88(a1)
 ; LP64F-NEXT:    fsw fs1, 84(a1)
@@ -538,22 +538,22 @@ define void @callee() nounwind {
 ; ILP32D-NEXT:    flw fs1, 84(a1)
 ; ILP32D-NEXT:    flw fs2, 88(a1)
 ; ILP32D-NEXT:    flw fs3, 92(a1)
-; ILP32D-NEXT:    flw fs4, 112(a1)
-; ILP32D-NEXT:    flw fs5, 116(a1)
-; ILP32D-NEXT:    flw fs6, 120(a1)
-; ILP32D-NEXT:    flw fs7, 124(a1)
-; ILP32D-NEXT:    flw fs8, 96(a1)
-; ILP32D-NEXT:    flw fs9, 100(a1)
-; ILP32D-NEXT:    flw fs10, 104(a1)
-; ILP32D-NEXT:    flw fs11, 108(a1)
-; ILP32D-NEXT:    fsw fs7, 124(a1)
-; ILP32D-NEXT:    fsw fs6, 120(a1)
-; ILP32D-NEXT:    fsw fs5, 116(a1)
-; ILP32D-NEXT:    fsw fs4, 112(a1)
-; ILP32D-NEXT:    fsw fs11, 108(a1)
-; ILP32D-NEXT:    fsw fs10, 104(a1)
-; ILP32D-NEXT:    fsw fs9, 100(a1)
-; ILP32D-NEXT:    fsw fs8, 96(a1)
+; ILP32D-NEXT:    flw fs4, 96(a1)
+; ILP32D-NEXT:    flw fs5, 100(a1)
+; ILP32D-NEXT:    flw fs6, 104(a1)
+; ILP32D-NEXT:    flw fs7, 108(a1)
+; ILP32D-NEXT:    flw fs8, 112(a1)
+; ILP32D-NEXT:    flw fs9, 116(a1)
+; ILP32D-NEXT:    flw fs10, 120(a1)
+; ILP32D-NEXT:    flw fs11, 124(a1)
+; ILP32D-NEXT:    fsw fs11, 124(a1)
+; ILP32D-NEXT:    fsw fs10, 120(a1)
+; ILP32D-NEXT:    fsw fs9, 116(a1)
+; ILP32D-NEXT:    fsw fs8, 112(a1)
+; ILP32D-NEXT:    fsw fs7, 108(a1)
+; ILP32D-NEXT:    fsw fs6, 104(a1)
+; ILP32D-NEXT:    fsw fs5, 100(a1)
+; ILP32D-NEXT:    fsw fs4, 96(a1)
 ; ILP32D-NEXT:    fsw fs3, 92(a1)
 ; ILP32D-NEXT:    fsw fs2, 88(a1)
 ; ILP32D-NEXT:    fsw fs1, 84(a1)
@@ -634,22 +634,22 @@ define void @callee() nounwind {
 ; LP64D-NEXT:    flw fs1, 84(a1)
 ; LP64D-NEXT:    flw fs2, 88(a1)
 ; LP64D-NEXT:    flw fs3, 92(a1)
-; LP64D-NEXT:    flw fs4, 112(a1)
-; LP64D-NEXT:    flw fs5, 116(a1)
-; LP64D-NEXT:    flw fs6, 120(a1)
-; LP64D-NEXT:    flw fs7, 124(a1)
-; LP64D-NEXT:    flw fs8, 96(a1)
-; LP64D-NEXT:    flw fs9, 100(a1)
-; LP64D-NEXT:    flw fs10, 104(a1)
-; LP64D-NEXT:    flw fs11, 108(a1)
-; LP64D-NEXT:    fsw fs7, 124(a1)
-; LP64D-NEXT:    fsw fs6, 120(a1)
-; LP64D-NEXT:    fsw fs5, 116(a1)
-; LP64D-NEXT:    fsw fs4, 112(a1)
-; LP64D-NEXT:    fsw fs11, 108(a1)
-; LP64D-NEXT:    fsw fs10, 104(a1)
-; LP64D-NEXT:    fsw fs9, 100(a1)
-; LP64D-NEXT:    fsw fs8, 96(a1)
+; LP64D-NEXT:    flw fs4, 96(a1)
+; LP64D-NEXT:    flw fs5, 100(a1)
+; LP64D-NEXT:    flw fs6, 104(a1)
+; LP64D-NEXT:    flw fs7, 108(a1)
+; LP64D-NEXT:    flw fs8, 112(a1)
+; LP64D-NEXT:    flw fs9, 116(a1)
+; LP64D-NEXT:    flw fs10, 120(a1)
+; LP64D-NEXT:    flw fs11, 124(a1)
+; LP64D-NEXT:    fsw fs11, 124(a1)
+; LP64D-NEXT:    fsw fs10, 120(a1)
+; LP64D-NEXT:    fsw fs9, 116(a1)
+; LP64D-NEXT:    fsw fs8, 112(a1)
+; LP64D-NEXT:    fsw fs7, 108(a1)
+; LP64D-NEXT:    fsw fs6, 104(a1)
+; LP64D-NEXT:    fsw fs5, 100(a1)
+; LP64D-NEXT:    fsw fs4, 96(a1)
 ; LP64D-NEXT:    fsw fs3, 92(a1)
 ; LP64D-NEXT:    fsw fs2, 88(a1)
 ; LP64D-NEXT:    fsw fs1, 84(a1)
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll b/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll
index 0501c700f57df..8a97e77bea55d 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll
@@ -45,26 +45,26 @@ define void @callee() nounwind {
 ; ILP32-NEXT:    fld ft11, 152(a1)
 ; ILP32-NEXT:    fld fs0, 160(a1)
 ; ILP32-NEXT:    fld fs1, 168(a1)
-; ILP32-NEXT:    fld fs2, 208(a1)
-; ILP32-NEXT:    fld fs3, 216(a1)
-; ILP32-NEXT:    fld fs4, 224(a1)
-; ILP32-NEXT:    fld fs5, 232(a1)
-; ILP32-NEXT:    fld fs6, 240(a1)
-; ILP32-NEXT:    fld fs7, 248(a1)
-; ILP32-NEXT:    fld fs8, 176(a1)
-; ILP32-NEXT:    fld fs9, 184(a1)
-; ILP32-NEXT:    fld fs10, 192(a1)
-; ILP32-NEXT:    fld fs11, 200(a1)
-; ILP32-NEXT:    fsd fs7, 248(a1)
-; ILP32-NEXT:    fsd fs6, 240(a1)
-; ILP32-NEXT:    fsd fs5, 232(a1)
-; ILP32-NEXT:    fsd fs4, 224(a1)
-; ILP32-NEXT:    fsd fs3, 216(a1)
-; ILP32-NEXT:    fsd fs2, 208(a1)
-; ILP32-NEXT:    fsd fs11, 200(a1)
-; ILP32-NEXT:    fsd fs10, 192(a1)
-; ILP32-NEXT:    fsd fs9, 184(a1)
-; ILP32-NEXT:    fsd fs8, 176(a1)
+; ILP32-NEXT:    fld fs2, 176(a1)
+; ILP32-NEXT:    fld fs3, 184(a1)
+; ILP32-NEXT:    fld fs4, 192(a1)
+; ILP32-NEXT:    fld fs5, 200(a1)
+; ILP32-NEXT:    fld fs6, 208(a1)
+; ILP32-NEXT:    fld fs7, 216(a1)
+; ILP32-NEXT:    fld fs8, 224(a1)
+; ILP32-NEXT:    fld fs9, 232(a1)
+; ILP32-NEXT:    fld fs10, 240(a1)
+; ILP32-NEXT:    fld fs11, 248(a1)
+; ILP32-NEXT:    fsd fs11, 248(a1)
+; ILP32-NEXT:    fsd fs10, 240(a1)
+; ILP32-NEXT:    fsd fs9, 232(a1)
+; ILP32-NEXT:    fsd fs8, 224(a1)
+; ILP32-NEXT:    fsd fs7, 216(a1)
+; ILP32-NEXT:    fsd fs6, 208(a1)
+; ILP32-NEXT:    fsd fs5, 200(a1)
+; ILP32-NEXT:    fsd fs4, 192(a1)
+; ILP32-NEXT:    fsd fs3, 184(a1)
+; ILP32-NEXT:    fsd fs2, 176(a1)
 ; ILP32-NEXT:    fsd fs1, 168(a1)
 ; ILP32-NEXT:    fsd fs0, 160(a1)
 ; ILP32-NEXT:    fsd ft11, 152(a1)
@@ -115,26 +115,26 @@ define void @callee() nounwind {
 ; LP64-NEXT:    fld ft11, 152(a1)
 ; LP64-NEXT:    fld fs0, 160(a1)
 ; LP64-NEXT:    fld fs1, 168(a1)
-; LP64-NEXT:    fld fs2, 208(a1)
-; LP64-NEXT:    fld fs3, 216(a1)
-; LP64-NEXT:    fld fs4, 224(a1)
-; LP64-NEXT:    fld fs5, 232(a1)
-; LP64-NEXT:    fld fs6, 240(a1)
-; LP64-NEXT:    fld fs7, 248(a1)
-; LP64-NEXT:    fld fs8, 176(a1)
-; LP64-NEXT:    fld fs9, 184(a1)
-; LP64-NEXT:    fld fs10, 192(a1)
-; LP64-NEXT:    fld fs11, 200(a1)
-; LP64-NEXT:    fsd fs7, 248(a1)
-; LP64-NEXT:    fsd fs6, 240(a1)
-; LP64-NEXT:    fsd fs5, 232(a1)
-; LP64-NEXT:    fsd fs4, 224(a1)
-; LP64-NEXT:    fsd fs3, 216(a1)
-; LP64-NEXT:    fsd fs2, 208(a1)
-; LP64-NEXT:    fsd fs11, 200(a1)
-; LP64-NEXT:    fsd fs10, 192(a1)
-; LP64-NEXT:    fsd fs9, 184(a1)
-; LP64-NEXT:    fsd fs8, 176(a1)
+; LP64-NEXT:    fld fs2, 176(a1)
+; LP64-NEXT:    fld fs3, 184(a1)
+; LP64-NEXT:    fld fs4, 192(a1)
+; LP64-NEXT:    fld fs5, 200(a1)
+; LP64-NEXT:    fld fs6, 208(a1)
+; LP64-NEXT:    fld fs7, 216(a1)
+; LP64-NEXT:    fld fs8, 224(a1)
+; LP64-NEXT:    fld fs9, 232(a1)
+; LP64-NEXT:    fld fs10, 240(a1)
+; LP64-NEXT:    fld fs11, 248(a1)
+; LP64-NEXT:    fsd fs11, 248(a1)
+; LP64-NEXT:    fsd fs10, 240(a1)
+; LP64-NEXT:    fsd fs9, 232(a1)
+; LP64-NEXT:    fsd fs8, 224(a1)
+; LP64-NEXT:    fsd fs7, 216(a1)
+; LP64-NEXT:    fsd fs6, 208(a1)
+; LP64-NEXT:    fsd fs5, 200(a1)
+; LP64-NEXT:    fsd fs4, 192(a1)
+; LP64-NEXT:    fsd fs3, 184(a1)
+; LP64-NEXT:    fsd fs2, 176(a1)
 ; LP64-NEXT:    fsd fs1, 168(a1)
 ; LP64-NEXT:    fsd fs0, 160(a1)
 ; LP64-NEXT:    fsd ft11, 152(a1)
@@ -185,26 +185,26 @@ define void @callee() nounwind {
 ; LP64E-NEXT:    fld ft11, 152(a1)
 ; LP64E-NEXT:    fld fs0, 160(a1)
 ; LP64E-NEXT:    fld fs1, 168(a1)
-; LP64E-NEXT:    fld fs2, 208(a1)
-; LP64E-NEXT:    fld fs3, 216(a1)
-; LP64E-NEXT:    fld fs4, 224(a1)
-; LP64E-NEXT:    fld fs5, 232(a1)
-; LP64E-NEXT:    fld fs6, 240(a1)
-; LP64E-NEXT:    fld fs7, 248(a1)
-; LP64E-NEXT:    fld fs8, 176(a1)
-; LP64E-NEXT:    fld fs9, 184(a1)
-; LP64E-NEXT:    fld fs10, 192(a1)
-; LP64E-NEXT:    fld fs11, 200(a1)
-; LP64E-NEXT:    fsd fs7, 248(a1)
-; LP64E-NEXT:    fsd fs6, 240(a1)
-; LP64E-NEXT:    fsd fs5, 232(a1)
-; LP64E-NEXT:    fsd fs4, 224(a1)
-; LP64E-NEXT:    fsd fs3, 216(a1)
-; LP64E-NEXT:    fsd fs2, 208(a1)
-; LP64E-NEXT:    fsd fs11, 200(a1)
-; LP64E-NEXT:    fsd fs10, 192(a1)
-; LP64E-NEXT:    fsd fs9, 184(a1)
-; LP64E-NEXT:    fsd fs8, 176(a1)
+; LP64E-NEXT:    fld fs2, 176(a1)
+; LP64E-NEXT:    fld fs3, 184(a1)
+; LP64E-NEXT:    fld fs4, 192(a1)
+; LP64E-NEXT:    fld fs5, 200(a1)
+; LP64E-NEXT:    fld fs6, 208(a1)
+; LP64E-NEXT:    fld fs7, 216(a1)
+; LP64E-NEXT:    fld fs8, 224(a1)
+; LP64E-NEXT:    fld fs9, 232(a1)
+; LP64E-NEXT:    fld fs10, 240(a1)
+; LP64E-NEXT:    fld fs11, 248(a1)
+; LP64E-NEXT:    fsd fs11, 248(a1)
+; LP64E-NEXT:    fsd fs10, 240(a1)
+; LP64E-NEXT:    fsd fs9, 232(a1)
+; LP64E-NEXT:    fsd fs8, 224(a1)
+; LP64E-NEXT:    fsd fs7, 216(a1)
+; LP64E-NEXT:    fsd fs6, 208(a1)
+; LP64E-NEXT:    fsd fs5, 200(a1)
+; LP64E-NEXT:    fsd fs4, 192(a1)
+; LP64E-NEXT:    fsd fs3, 184(a1)
+; LP64E-NEXT:    fsd fs2, 176(a1)
 ; LP64E-NEXT:    fsd fs1, 168(a1)
 ; LP64E-NEXT:    fsd fs0, 160(a1)
 ; LP64E-NEXT:    fsd ft11, 152(a1)
@@ -268,26 +268,26 @@ define void @callee() nounwind {
 ; ILP32D-NEXT:    fld ft11, 152(a1)
 ; ILP32D-NEXT:    fld fs0, 160(a1)
 ; ILP32D-NEXT:    fld fs1, 168(a1)
-; ILP32D-NEXT:    fld fs2, 208(a1)
-; ILP32D-NEXT:    fld fs3, 216(a1)
-; ILP32D-NEXT:    fld fs4, 224(a1)
-; ILP32D-NEXT:    fld fs5, 232(a1)
-; ILP32D-NEXT:    fld fs6, 240(a1)
-; ILP32D-NEXT:    fld fs7, 248(a1)
-; ILP32D-NEXT:    fld fs8, 176(a1)
-; ILP32D-NEXT:    fld fs9, 184(a1)
-; ILP32D-NEXT:    fld fs10, 192(a1)
-; ILP32D-NEXT:    fld fs11, 200(a1)
-; ILP32D-NEXT:    fsd fs7, 248(a1)
-; ILP32D-NEXT:    fsd fs6, 240(a1)
-; ILP32D-NEXT:    fsd fs5, 232(a1)
-; ILP32D-NEXT:    fsd fs4, 224(a1)
-; ILP32D-NEXT:    fsd fs3, 216(a1)
-; ILP32D-NEXT:    fsd fs2, 208(a1)
-; ILP32D-NEXT:    fsd fs11, 200(a1)
-; ILP32D-NEXT:    fsd fs10, 192(a1)
-; ILP32D-NEXT:    fsd fs9, 184(a1)
-; ILP32D-NEXT:    fsd fs8, 176(a1)
+; ILP32D-NEXT:    fld fs2, 176(a1)
+; ILP32D-NEXT:    fld fs3, 184(a1)
+; ILP32D-NEXT:    fld fs4, 192(a1)
+; ILP32D-NEXT:    fld fs5, 200(a1)
+; ILP32D-NEXT:    fld fs6, 208(a1)
+; ILP32D-NEXT:    fld fs7, 216(a1)
+; ILP32D-NEXT:    fld fs8, 224(a1)
+; ILP32D-NEXT:    fld fs9, 232(a1)
+; ILP32D-NEXT:    fld fs10, 240(a1)
+; ILP32D-NEXT:    fld fs11, 248(a1)
+; ILP32D-NEXT:    fsd fs11, 248(a1)
+; ILP32D-NEXT:    fsd fs10, 240(a1)
+; ILP32D-NEXT:    fsd fs9, 232(a1)
+; ILP32D-NEXT:    fsd fs8, 224(a1)
+; ILP32D-NEXT:    fsd fs7, 216(a1)
+; ILP32D-NEXT:    fsd fs6, 208(a1)
+; ILP32D-NEXT:    fsd fs5, 200(a1)
+; ILP32D-NEXT:    fsd fs4, 192(a1)
+; ILP32D-NEXT:    fsd fs3, 184(a1)
+; ILP32D-NEXT:    fsd fs2, 176(a1)
 ; ILP32D-NEXT:    fsd fs1, 168(a1)
 ; ILP32D-NEXT:    fsd fs0, 160(a1)
 ; ILP32D-NEXT:    fsd ft11, 152(a1)
@@ -364,26 +364,26 @@ define void @callee() nounwind {
 ; LP64D-NEXT:    fld ft11, 152(a1)
 ; LP64D-NEXT:    fld fs0, 160(a1)
 ; LP64D-NEXT:    fld fs1, 168(a1)
-; LP64D-NEXT:    fld fs2, 208(a1)
-; LP64D-NEXT:    fld fs3, 216(a1)
-; LP64D-NEXT:    fld fs4, 224(a1)
-; LP64D-NEXT:    fld fs5, 232(a1)
-; LP64D-NEXT:    fld fs6, 240(a1)
-; LP64D-NEXT:    fld fs7, 248(a1)
-; LP64D-NEXT:    fld fs8, 176(a1)
-; LP64D-NEXT:    fld fs9, 184(a1)
-; LP64D-NEXT:    fld fs10, 192(a1)
-; LP64D-NEXT:    fld fs11, 200(a1)
-; LP64D-NEXT:    fsd fs7, 248(a1)
-; LP64D-NEXT:    fsd fs6, 240(a1)
-; LP64D-NEXT:    fsd fs5, 232(a1)
-; LP64D-NEXT:    fsd fs4, 224(a1)
-; LP64D-NEXT:    fsd fs3, 216(a1)
-; LP64D-NEXT:    fsd fs2, 208(a1)
-; LP64D-NEXT:    fsd fs11, 200(a1)
-; LP64D-NEXT:    fsd fs10, 192(a1)
-; LP64D-NEXT:    fsd fs9, 184(a1)
-; LP64D-NEXT:    fsd fs8, 176(a1)
+; LP64D-NEXT:    fld fs2, 176(a1)
+; LP64D-NEXT:    fld fs3, 184(a1)
+; LP64D-NEXT:    fld fs4, 192(a1)
+; LP64D-NEXT:    fld fs5, 200(a1)
+; LP64D-NEXT:    fld fs6, 208(a1)
+; LP64D-NEXT:    fld fs7, 216(a1)
+; LP64D-NEXT:    fld fs8, 224(a1)
+; LP64D-NEXT:    fld fs9, 232(a1)
+; LP64D-NEXT:    fld fs10, 240(a1)
+; LP64D-NEXT:    fld fs11, 248(a1)
+; LP64D-NEXT:    fsd fs11, 248(a1)
+; LP64D-NEXT:    fsd fs10, 240(a1)
+; LP64D-NEXT:    fsd fs9, 232(a1)
+; LP64D-NEXT:    fsd fs8, 224(a1)
+; LP64D-NEXT:    fsd fs7, 216(a1)
+; LP64D-NEXT:    fsd fs6, 208(a1)
+; LP64D-NEXT:    fsd fs5, 200(a1)
+; LP64D-NEXT:    fsd fs4, 192(a1)
+; LP64D-NEXT:    fsd fs3, 184(a1)
+; LP64D-NEXT:    fsd fs2, 176(a1)
 ; LP64D-NEXT:    fsd fs1, 168(a1)
 ; LP64D-NEXT:    fsd fs0, 160(a1)
 ; LP64D-NEXT:    fsd ft11, 152(a1)
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
index f9f1ba60a8ac0..53a4b1bafaab6 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
@@ -68,16 +68,16 @@ define void @callee() {
 ; RV32I-NEXT:    .cfi_offset s9, -44
 ; RV32I-NEXT:    .cfi_offset s10, -48
 ; RV32I-NEXT:    .cfi_offset s11, -52
-; RV32I-NEXT:    lui a7, %hi(var)
-; RV32I-NEXT:    lw a0, %lo(var)(a7)
+; RV32I-NEXT:    lui a4, %hi(var)
+; RV32I-NEXT:    lw a0, %lo(var)(a4)
 ; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var+4)(a7)
+; RV32I-NEXT:    lw a0, %lo(var+4)(a4)
 ; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var+8)(a7)
+; RV32I-NEXT:    lw a0, %lo(var+8)(a4)
 ; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var+12)(a7)
+; RV32I-NEXT:    lw a0, %lo(var+12)(a4)
 ; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    addi a5, a7, %lo(var)
+; RV32I-NEXT:    addi a5, a4, %lo(var)
 ; RV32I-NEXT:    lw a0, 16(a5)
 ; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lw a0, 20(a5)
@@ -100,22 +100,22 @@ define void @callee() {
 ; RV32I-NEXT:    lw s8, 84(a5)
 ; RV32I-NEXT:    lw s9, 88(a5)
 ; RV32I-NEXT:    lw s10, 92(a5)
-; RV32I-NEXT:    lw s11, 112(a5)
-; RV32I-NEXT:    lw ra, 116(a5)
-; RV32I-NEXT:    lw a3, 120(a5)
-; RV32I-NEXT:    lw a0, 124(a5)
-; RV32I-NEXT:    lw a6, 96(a5)
-; RV32I-NEXT:    lw a4, 100(a5)
-; RV32I-NEXT:    lw a2, 104(a5)
-; RV32I-NEXT:    lw a1, 108(a5)
-; RV32I-NEXT:    sw a0, 124(a5)
-; RV32I-NEXT:    sw a3, 120(a5)
-; RV32I-NEXT:    sw ra, 116(a5)
-; RV32I-NEXT:    sw s11, 112(a5)
-; RV32I-NEXT:    sw a1, 108(a5)
-; RV32I-NEXT:    sw a2, 104(a5)
-; RV32I-NEXT:    sw a4, 100(a5)
-; RV32I-NEXT:    sw a6, 96(a5)
+; RV32I-NEXT:    lw s11, 96(a5)
+; RV32I-NEXT:    lw ra, 100(a5)
+; RV32I-NEXT:    lw a6, 104(a5)
+; RV32I-NEXT:    lw a3, 108(a5)
+; RV32I-NEXT:    lw a2, 112(a5)
+; RV32I-NEXT:    lw a1, 116(a5)
+; RV32I-NEXT:    lw a0, 120(a5)
+; RV32I-NEXT:    lw a7, 124(a5)
+; RV32I-NEXT:    sw a7, 124(a5)
+; RV32I-NEXT:    sw a0, 120(a5)
+; RV32I-NEXT:    sw a1, 116(a5)
+; RV32I-NEXT:    sw a2, 112(a5)
+; RV32I-NEXT:    sw a3, 108(a5)
+; RV32I-NEXT:    sw a6, 104(a5)
+; RV32I-NEXT:    sw ra, 100(a5)
+; RV32I-NEXT:    sw s11, 96(a5)
 ; RV32I-NEXT:    sw s10, 92(a5)
 ; RV32I-NEXT:    sw s9, 88(a5)
 ; RV32I-NEXT:    sw s8, 84(a5)
@@ -139,13 +139,13 @@ define void @callee() {
 ; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sw a0, 16(a5)
 ; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var+12)(a7)
+; RV32I-NEXT:    sw a0, %lo(var+12)(a4)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var+8)(a7)
+; RV32I-NEXT:    sw a0, %lo(var+8)(a4)
 ; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var+4)(a7)
+; RV32I-NEXT:    sw a0, %lo(var+4)(a4)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var)(a7)
+; RV32I-NEXT:    sw a0, %lo(var)(a4)
 ; RV32I-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
@@ -186,16 +186,16 @@ define void @callee() {
 ; RV32I-ILP32E-NEXT:    .cfi_offset ra, -4
 ; RV32I-ILP32E-NEXT:    .cfi_offset s0, -8
 ; RV32I-ILP32E-NEXT:    .cfi_offset s1, -12
-; RV32I-ILP32E-NEXT:    lui a7, %hi(var)
-; RV32I-ILP32E-NEXT:    lw a0, %lo(var)(a7)
+; RV32I-ILP32E-NEXT:    lui a4, %hi(var)
+; RV32I-ILP32E-NEXT:    lw a0, %lo(var)(a4)
 ; RV32I-ILP32E-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-ILP32E-NEXT:    lw a0, %lo(var+4)(a7)
+; RV32I-ILP32E-NEXT:    lw a0, %lo(var+4)(a4)
 ; RV32I-ILP32E-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-ILP32E-NEXT:    lw a0, %lo(var+8)(a7)
+; RV32I-ILP32E-NEXT:    lw a0, %lo(var+8)(a4)
 ; RV32I-ILP32E-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-ILP32E-NEXT:    lw a0, %lo(var+12)(a7)
+; RV32I-ILP32E-NEXT:    lw a0, %lo(var+12)(a4)
 ; RV32I-ILP32E-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-ILP32E-NEXT:    addi a5, a7, %lo(var)
+; RV32I-ILP32E-NEXT:    addi a5, a4, %lo(var)
 ; RV32I-ILP32E-NEXT:    lw a0, 16(a5)
 ; RV32I-ILP32E-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
 ; RV32I-ILP32E-NEXT:    lw a0, 20(a5)
@@ -218,22 +218,22 @@ define void @callee() {
 ; RV32I-ILP32E-NEXT:    lw s10, 84(a5)
 ; RV32I-ILP32E-NEXT:    lw s11, 88(a5)
 ; RV32I-ILP32E-NEXT:    lw s0, 92(a5)
-; RV32I-ILP32E-NEXT:    lw s1, 112(a5)
-; RV32I-ILP32E-NEXT:    lw ra, 116(a5)
-; RV32I-ILP32E-NEXT:    lw a3, 120(a5)
-; RV32I-ILP32E-NEXT:    lw a0, 124(a5)
-; RV32I-ILP32E-NEXT:    lw a6, 96(a5)
-; RV32I-ILP32E-NEXT:    lw a4, 100(a5)
-; RV32I-ILP32E-NEXT:    lw a2, 104(a5)
-; RV32I-ILP32E-NEXT:    lw a1, 108(a5)
-; RV32I-ILP32E-NEXT:    sw a0, 124(a5)
-; RV32I-ILP32E-NEXT:    sw a3, 120(a5)
-; RV32I-ILP32E-NEXT:    sw ra, 116(a5)
-; RV32I-ILP32E-NEXT:    sw s1, 112(a5)
-; RV32I-ILP32E-NEXT:    sw a1, 108(a5)
-; RV32I-ILP32E-NEXT:    sw a2, 104(a5)
-; RV32I-ILP32E-NEXT:    sw a4, 100(a5)
-; RV32I-ILP32E-NEXT:    sw a6, 96(a5)
+; RV32I-ILP32E-NEXT:    lw s1, 96(a5)
+; RV32I-ILP32E-NEXT:    lw ra, 100(a5)
+; RV32I-ILP32E-NEXT:    lw a6, 104(a5)
+; RV32I-ILP32E-NEXT:    lw a3, 108(a5)
+; RV32I-ILP32E-NEXT:    lw a2, 112(a5)
+; RV32I-ILP32E-NEXT:    lw a1, 116(a5)
+; RV32I-ILP32E-NEXT:    lw a0, 120(a5)
+; RV32I-ILP32E-NEXT:    lw a7, 124(a5)
+; RV32I-ILP32E-NEXT:    sw a7, 124(a5)
+; RV32I-ILP32E-NEXT:    sw a0, 120(a5)
+; RV32I-ILP32E-NEXT:    sw a1, 116(a5)
+; RV32I-ILP32E-NEXT:    sw a2, 112(a5)
+; RV32I-ILP32E-NEXT:    sw a3, 108(a5)
+; RV32I-ILP32E-NEXT:    sw a6, 104(a5)
+; RV32I-ILP32E-NEXT:    sw ra, 100(a5)
+; RV32I-ILP32E-NEXT:    sw s1, 96(a5)
 ; RV32I-ILP32E-NEXT:    sw s0, 92(a5)
 ; RV32I-ILP32E-NEXT:    sw s11, 88(a5)
 ; RV32I-ILP32E-NEXT:    sw s10, 84(a5)
@@ -257,13 +257,13 @@ define void @callee() {
 ; RV32I-ILP32E-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
 ; RV32I-ILP32E-NEXT:    sw a0, 16(a5)
 ; RV32I-ILP32E-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-ILP32E-NEXT:    sw a0, %lo(var+12)(a7)
+; RV32I-ILP32E-NEXT:    sw a0, %lo(var+12)(a4)
 ; RV32I-ILP32E-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-ILP32E-NEXT:    sw a0, %lo(var+8)(a7)
+; RV32I-ILP32E-NEXT:    sw a0, %lo(var+8)(a4)
 ; RV32I-ILP32E-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-ILP32E-NEXT:    sw a0, %lo(var+4)(a7)
+; RV32I-ILP32E-NEXT:    sw a0, %lo(var+4)(a4)
 ; RV32I-ILP32E-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-ILP32E-NEXT:    sw a0, %lo(var)(a7)
+; RV32I-ILP32E-NEXT:    sw a0, %lo(var)(a4)
 ; RV32I-ILP32E-NEXT:    lw ra, 32(sp) # 4-byte Folded Reload
 ; RV32I-ILP32E-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
 ; RV32I-ILP32E-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
@@ -306,16 +306,16 @@ define void @callee() {
 ; RV32I-WITH-FP-NEXT:    .cfi_offset s11, -52
 ; RV32I-WITH-FP-NEXT:    addi s0, sp, 80
 ; RV32I-WITH-FP-NEXT:    .cfi_def_cfa s0, 0
-; RV32I-WITH-FP-NEXT:    lui t0, %hi(var)
-; RV32I-WITH-FP-NEXT:    lw a0, %lo(var)(t0)
+; RV32I-WITH-FP-NEXT:    lui a4, %hi(var)
+; RV32I-WITH-FP-NEXT:    lw a0, %lo(var)(a4)
 ; RV32I-WITH-FP-NEXT:    sw a0, -56(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, %lo(var+4)(t0)
+; RV32I-WITH-FP-NEXT:    lw a0, %lo(var+4)(a4)
 ; RV32I-WITH-FP-NEXT:    sw a0, -60(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, %lo(var+8)(t0)
+; RV32I-WITH-FP-NEXT:    lw a0, %lo(var+8)(a4)
 ; RV32I-WITH-FP-NEXT:    sw a0, -64(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, %lo(var+12)(t0)
+; RV32I-WITH-FP-NEXT:    lw a0, %lo(var+12)(a4)
 ; RV32I-WITH-FP-NEXT:    sw a0, -68(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    addi a5, t0, %lo(var)
+; RV32I-WITH-FP-NEXT:    addi a5, a4, %lo(var)
 ; RV32I-WITH-FP-NEXT:    lw a0, 16(a5)
 ; RV32I-WITH-FP-NEXT:    sw a0, -72(s0) # 4-byte Folded Spill
 ; RV32I-WITH-FP-NEXT:    lw a0, 20(a5)
@@ -339,22 +339,22 @@ define void @callee() {
 ; RV32I-WITH-FP-NEXT:    lw s9, 84(a5)
 ; RV32I-WITH-FP-NEXT:    lw s10, 88(a5)
 ; RV32I-WITH-FP-NEXT:    lw s11, 92(a5)
-; RV32I-WITH-FP-NEXT:    lw ra, 112(a5)
-; RV32I-WITH-FP-NEXT:    lw a4, 116(a5)
-; RV32I-WITH-FP-NEXT:    lw a3, 120(a5)
-; RV32I-WITH-FP-NEXT:    lw a0, 124(a5)
-; RV32I-WITH-FP-NEXT:    lw a7, 96(a5)
-; RV32I-WITH-FP-NEXT:    lw a6, 100(a5)
-; RV32I-WITH-FP-NEXT:    lw a2, 104(a5)
-; RV32I-WITH-FP-NEXT:    lw a1, 108(a5)
-; RV32I-WITH-FP-NEXT:    sw a0, 124(a5)
-; RV32I-WITH-FP-NEXT:    sw a3, 120(a5)
-; RV32I-WITH-FP-NEXT:    sw a4, 116(a5)
-; RV32I-WITH-FP-NEXT:    sw ra, 112(a5)
-; RV32I-WITH-FP-NEXT:    sw a1, 108(a5)
-; RV32I-WITH-FP-NEXT:    sw a2, 104(a5)
-; RV32I-WITH-FP-NEXT:    sw a6, 100(a5)
-; RV32I-WITH-FP-NEXT:    sw a7, 96(a5)
+; RV32I-WITH-FP-NEXT:    lw ra, 96(a5)
+; RV32I-WITH-FP-NEXT:    lw a7, 100(a5)
+; RV32I-WITH-FP-NEXT:    lw a6, 104(a5)
+; RV32I-WITH-FP-NEXT:    lw a3, 108(a5)
+; RV32I-WITH-FP-NEXT:    lw a2, 112(a5)
+; RV32I-WITH-FP-NEXT:    lw a1, 116(a5)
+; RV32I-WITH-FP-NEXT:    lw a0, 120(a5)
+; RV32I-WITH-FP-NEXT:    lw t0, 124(a5)
+; RV32I-WITH-FP-NEXT:    sw t0, 124(a5)
+; RV32I-WITH-FP-NEXT:    sw a0, 120(a5)
+; RV32I-WITH-FP-NEXT:    sw a1, 116(a5)
+; RV32I-WITH-FP-NEXT:    sw a2, 112(a5)
+; RV32I-WITH-FP-NEXT:    sw a3, 108(a5)
+; RV32I-WITH-FP-NEXT:    sw a6, 104(a5)
+; RV32I-WITH-FP-NEXT:    sw a7, 100(a5)
+; RV32I-WITH-FP-NEXT:    sw ra, 96(a5)
 ; RV32I-WITH-FP-NEXT:    sw s11, 92(a5)
 ; RV32I-WITH-FP-NEXT:    sw s10, 88(a5)
 ; RV32I-WITH-FP-NEXT:    sw s9, 84(a5)
@@ -379,13 +379,13 @@ define void @callee() {
 ; RV32I-WITH-FP-NEXT:    lw a0, -72(s0) # 4-byte Folded Reload
 ; RV32I-WITH-FP-NEXT:    sw a0, 16(a5)
 ; RV32I-WITH-FP-NEXT:    lw a0, -68(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, %lo(var+12)(t0)
+; RV32I-WITH-FP-NEXT:    sw a0, %lo(var+12)(a4)
 ; RV32I-WITH-FP-NEXT:    lw a0, -64(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, %lo(var+8)(t0)
+; RV32I-WITH-FP-NEXT:    sw a0, %lo(var+8)(a4)
 ; RV32I-WITH-FP-NEXT:    lw a0, -60(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, %lo(var+4)(t0)
+; RV32I-WITH-FP-NEXT:    sw a0, %lo(var+4)(a4)
 ; RV32I-WITH-FP-NEXT:    lw a0, -56(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, %lo(var)(t0)
+; RV32I-WITH-FP-NEXT:    sw a0, %lo(var)(a4)
 ; RV32I-WITH-FP-NEXT:    .cfi_def_cfa sp, 80
 ; RV32I-WITH-FP-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32I-WITH-FP-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
@@ -434,16 +434,16 @@ define void @callee() {
 ; RV32IZCMP-NEXT:    .cfi_offset s9, -12
 ; RV32IZCMP-NEXT:    .cfi_offset s10, -8
 ; RV32IZCMP-NEXT:    .cfi_offset s11, -4
-; RV32IZCMP-NEXT:    lui t0, %hi(var)
-; RV32IZCMP-NEXT:    lw a0, %lo(var)(t0)
+; RV32IZCMP-NEXT:    lui a4, %hi(var)
+; RV32IZCMP-NEXT:    lw a0, %lo(var)(a4)
 ; RV32IZCMP-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var+4)(t0)
+; RV32IZCMP-NEXT:    lw a0, %lo(var+4)(a4)
 ; RV32IZCMP-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var+8)(t0)
+; RV32IZCMP-NEXT:    lw a0, %lo(var+8)(a4)
 ; RV32IZCMP-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var+12)(t0)
+; RV32IZCMP-NEXT:    lw a0, %lo(var+12)(a4)
 ; RV32IZCMP-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    addi a5, t0, %lo(var)
+; RV32IZCMP-NEXT:    addi a5, a4, %lo(var)
 ; RV32IZCMP-NEXT:    lw a0, 16(a5)
 ; RV32IZCMP-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
 ; RV32IZCMP-NEXT:    lw a0, 20(a5)
@@ -463,28 +463,28 @@ define void @callee() {
 ; RV32IZCMP-NEXT:    lw s11, 72(a5)
 ; RV32IZCMP-NEXT:    lw ra, 76(a5)
 ; RV32IZCMP-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-NEXT:    lw a7, 112(a5)
-; RV32IZCMP-NEXT:    lw s0, 116(a5)
-; RV32IZCMP-NEXT:    lw a3, 120(a5)
-; RV32IZCMP-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-NEXT:    lw a6, 96(a5)
-; RV32IZCMP-NEXT:    lw a4, 100(a5)
-; RV32IZCMP-NEXT:    lw a2, 104(a5)
-; RV32IZCMP-NEXT:    lw a1, 108(a5)
-; RV32IZCMP-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-NEXT:    sw a3, 120(a5)
-; RV32IZCMP-NEXT:    sw s0, 116(a5)
-; RV32IZCMP-NEXT:    sw a7, 112(a5)
-; RV32IZCMP-NEXT:    sw a1, 108(a5)
-; RV32IZCMP-NEXT:    sw a2, 104(a5)
-; RV32IZCMP-NEXT:    sw a4, 100(a5)
-; RV32IZCMP-NEXT:    sw a6, 96(a5)
-; RV32IZCMP-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-NEXT:    sw t3, 84(a5)
+; RV32IZCMP-NEXT:    lw t2, 84(a5)
+; RV32IZCMP-NEXT:    lw t1, 88(a5)
+; RV32IZCMP-NEXT:    lw t0, 92(a5)
+; RV32IZCMP-NEXT:    lw a7, 96(a5)
+; RV32IZCMP-NEXT:    lw s0, 100(a5)
+; RV32IZCMP-NEXT:    lw a6, 104(a5)
+; RV32IZCMP-NEXT:    lw a3, 108(a5)
+; RV32IZCMP-NEXT:    lw a2, 112(a5)
+; RV32IZCMP-NEXT:    lw a1, 116(a5)
+; RV32IZCMP-NEXT:    lw a0, 120(a5)
+; RV32IZCMP-NEXT:    lw t3, 124(a5)
+; RV32IZCMP-NEXT:    sw t3, 124(a5)
+; RV32IZCMP-NEXT:    sw a0, 120(a5)
+; RV32IZCMP-NEXT:    sw a1, 116(a5)
+; RV32IZCMP-NEXT:    sw a2, 112(a5)
+; RV32IZCMP-NEXT:    sw a3, 108(a5)
+; RV32IZCMP-NEXT:    sw a6, 104(a5)
+; RV32IZCMP-NEXT:    sw s0, 100(a5)
+; RV32IZCMP-NEXT:    sw a7, 96(a5)
+; RV32IZCMP-NEXT:    sw t0, 92(a5)
+; RV32IZCMP-NEXT:    sw t1, 88(a5)
+; RV32IZCMP-NEXT:    sw t2, 84(a5)
 ; RV32IZCMP-NEXT:    sw s1, 80(a5)
 ; RV32IZCMP-NEXT:    sw ra, 76(a5)
 ; RV32IZCMP-NEXT:    sw s11, 72(a5)
@@ -505,13 +505,13 @@ define void @callee() {
 ; RV32IZCMP-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32IZCMP-NEXT:    sw a0, 16(a5)
 ; RV32IZCMP-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var+12)(t0)
+; RV32IZCMP-NEXT:    sw a0, %lo(var+12)(a4)
 ; RV32IZCMP-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var+8)(t0)
+; RV32IZCMP-NEXT:    sw a0, %lo(var+8)(a4)
 ; RV32IZCMP-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var+4)(t0)
+; RV32IZCMP-NEXT:    sw a0, %lo(var+4)(a4)
 ; RV32IZCMP-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var)(t0)
+; RV32IZCMP-NEXT:    sw a0, %lo(var)(a4)
 ; RV32IZCMP-NEXT:    cm.popret {ra, s0-s11}, 80
 ;
 ; RV32IZCMP-WITH-FP-LABEL: callee:
@@ -546,16 +546,16 @@ define void @callee() {
 ; RV32IZCMP-WITH-FP-NEXT:    .cfi_offset s11, -52
 ; RV32IZCMP-WITH-FP-NEXT:    addi s0, sp, 80
 ; RV32IZCMP-WITH-FP-NEXT:    .cfi_def_cfa s0, 0
-; RV32IZCMP-WITH-FP-NEXT:    lui t1, %hi(var)
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var)(t1)
+; RV32IZCMP-WITH-FP-NEXT:    lui a4, %hi(var)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var)(a4)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -56(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+4)(t1)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+4)(a4)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -60(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+8)(t1)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+8)(a4)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -64(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+12)(t1)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+12)(a4)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -68(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    addi a5, t1, %lo(var)
+; RV32IZCMP-WITH-FP-NEXT:    addi a5, a4, %lo(var)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, 16(a5)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -72(s0) # 4-byte Folded Spill
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, 20(a5)
@@ -575,30 +575,30 @@ define void @callee() {
 ; RV32IZCMP-WITH-FP-NEXT:    lw s10, 68(a5)
 ; RV32IZCMP-WITH-FP-NEXT:    lw s11, 72(a5)
 ; RV32IZCMP-WITH-FP-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t4, 80(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t2, 88(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw t3, 80(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw t2, 84(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw t1, 88(a5)
 ; RV32IZCMP-WITH-FP-NEXT:    lw s1, 92(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t0, 112(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a4, 116(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a3, 120(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a7, 96(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a6, 100(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a2, 104(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a1, 108(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a3, 120(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a4, 116(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t0, 112(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a1, 108(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a2, 104(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a6, 100(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a7, 96(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw t0, 96(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw a7, 100(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw a6, 104(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw a3, 108(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw a2, 112(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw a1, 116(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 120(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw t4, 124(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw t4, 124(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 120(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw a1, 116(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw a2, 112(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw a3, 108(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw a6, 104(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw a7, 100(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw t0, 96(a5)
 ; RV32IZCMP-WITH-FP-NEXT:    sw s1, 92(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t4, 80(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw t1, 88(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw t2, 84(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw t3, 80(a5)
 ; RV32IZCMP-WITH-FP-NEXT:    sw ra, 76(a5)
 ; RV32IZCMP-WITH-FP-NEXT:    sw s11, 72(a5)
 ; RV32IZCMP-WITH-FP-NEXT:    sw s10, 68(a5)
@@ -619,13 +619,13 @@ define void @callee() {
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -72(s0) # 4-byte Folded Reload
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, 16(a5)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -68(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+12)(t1)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+12)(a4)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -64(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+8)(t1)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+8)(a4)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -60(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+4)(t1)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+4)(a4)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -56(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var)(t1)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var)(a4)
 ; RV32IZCMP-WITH-FP-NEXT:    .cfi_def_cfa sp, 80
 ; RV32IZCMP-WITH-FP-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32IZCMP-WITH-FP-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
@@ -687,16 +687,16 @@ define void @callee() {
 ; RV64I-NEXT:    .cfi_offset s9, -88
 ; RV64I-NEXT:    .cfi_offset s10, -96
 ; RV64I-NEXT:    .cfi_offset s11, -104
-; RV64I-NEXT:    lui a7, %hi(var)
-; RV64I-NEXT:    lw a0, %lo(var)(a7)
+; RV64I-NEXT:    lui a4, %hi(var)
+; RV64I-NEXT:    lw a0, %lo(var)(a4)
 ; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var+4)(a7)
+; RV64I-NEXT:    lw a0, %lo(var+4)(a4)
 ; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var+8)(a7)
+; RV64I-NEXT:    lw a0, %lo(var+8)(a4)
 ; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var+12)(a7)
+; RV64I-NEXT:    lw a0, %lo(var+12)(a4)
 ; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    addi a5, a7, %lo(var)
+; RV64I-NEXT:    addi a5, a4, %lo(var)
 ; RV64I-NEXT:    lw a0, 16(a5)
 ; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lw a0, 20(a5)
@@ -719,22 +719,22 @@ define void @callee() {
 ; RV64I-NEXT:    lw s8, 84(a5)
 ; RV64I-NEXT:    lw s9, 88(a5)
 ; RV64I-NEXT:    lw s10, 92(a5)
-; RV64I-NEXT:    lw s11, 112(a5)
-; RV64I-NEXT:    lw ra, 116(a5)
-; RV64I-NEXT:    lw a3, 120(a5)
-; RV64I-NEXT:    lw a0, 124(a5)
-; RV64I-NEXT:    lw a6, 96(a5)
-; RV64I-NEXT:    lw a4, 100(a5)
-; RV64I-NEXT:    lw a2, 104(a5)
-; RV64I-NEXT:    lw a1, 108(a5)
-; RV64I-NEXT:    sw a0, 124(a5)
-; RV64I-NEXT:    sw a3, 120(a5)
-; RV64I-NEXT:    sw ra, 116(a5)
-; RV64I-NEXT:    sw s11, 112(a5)
-; RV64I-NEXT:    sw a1, 108(a5)
-; RV64I-NEXT:    sw a2, 104(a5)
-; RV64I-NEXT:    sw a4, 100(a5)
-; RV64I-NEXT:    sw a6, 96(a5)
+; RV64I-NEXT:    lw s11, 96(a5)
+; RV64I-NEXT:    lw ra, 100(a5)
+; RV64I-NEXT:    lw a6, 104(a5)
+; RV64I-NEXT:    lw a3, 108(a5)
+; RV64I-NEXT:    lw a2, 112(a5)
+; RV64I-NEXT:    lw a1, 116(a5)
+; RV64I-NEXT:    lw a0, 120(a5)
+; RV64I-NEXT:    lw a7, 124(a5)
+; RV64I-NEXT:    sw a7, 124(a5)
+; RV64I-NEXT:    sw a0, 120(a5)
+; RV64I-NEXT:    sw a1, 116(a5)
+; RV64I-NEXT:    sw a2, 112(a5)
+; RV64I-NEXT:    sw a3, 108(a5)
+; RV64I-NEXT:    sw a6, 104(a5)
+; RV64I-NEXT:    sw ra, 100(a5)
+; RV64I-NEXT:    sw s11, 96(a5)
 ; RV64I-NEXT:    sw s10, 92(a5)
 ; RV64I-NEXT:    sw s9, 88(a5)
 ; RV64I-NEXT:    sw s8, 84(a5)
@@ -758,13 +758,13 @@ define void @callee() {
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sw a0, 16(a5)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var+12)(a7)
+; RV64I-NEXT:    sw a0, %lo(var+12)(a4)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var+8)(a7)
+; RV64I-NEXT:    sw a0, %lo(var+8)(a4)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var+4)(a7)
+; RV64I-NEXT:    sw a0, %lo(var+4)(a4)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var)(a7)
+; RV64I-NEXT:    sw a0, %lo(var)(a4)
 ; RV64I-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 144(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 136(sp) # 8-byte Folded Reload
@@ -805,16 +805,16 @@ define void @callee() {
 ; RV64I-LP64E-NEXT:    .cfi_offset ra, -8
 ; RV64I-LP64E-NEXT:    .cfi_offset s0, -16
 ; RV64I-LP64E-NEXT:    .cfi_offset s1, -24
-; RV64I-LP64E-NEXT:    lui a7, %hi(var)
-; RV64I-LP64E-NEXT:    lw a0, %lo(var)(a7)
+; RV64I-LP64E-NEXT:    lui a4, %hi(var)
+; RV64I-LP64E-NEXT:    lw a0, %lo(var)(a4)
 ; RV64I-LP64E-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-LP64E-NEXT:    lw a0, %lo(var+4)(a7)
+; RV64I-LP64E-NEXT:    lw a0, %lo(var+4)(a4)
 ; RV64I-LP64E-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-LP64E-NEXT:    lw a0, %lo(var+8)(a7)
+; RV64I-LP64E-NEXT:    lw a0, %lo(var+8)(a4)
 ; RV64I-LP64E-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-LP64E-NEXT:    lw a0, %lo(var+12)(a7)
+; RV64I-LP64E-NEXT:    lw a0, %lo(var+12)(a4)
 ; RV64I-LP64E-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-LP64E-NEXT:    addi a5, a7, %lo(var)
+; RV64I-LP64E-NEXT:    addi a5, a4, %lo(var)
 ; RV64I-LP64E-NEXT:    lw a0, 16(a5)
 ; RV64I-LP64E-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
 ; RV64I-LP64E-NEXT:    lw a0, 20(a5)
@@ -837,22 +837,22 @@ define void @callee() {
 ; RV64I-LP64E-NEXT:    lw s10, 84(a5)
 ; RV64I-LP64E-NEXT:    lw s11, 88(a5)
 ; RV64I-LP64E-NEXT:    lw s0, 92(a5)
-; RV64I-LP64E-NEXT:    lw s1, 112(a5)
-; RV64I-LP64E-NEXT:    lw ra, 116(a5)
-; RV64I-LP64E-NEXT:    lw a3, 120(a5)
-; RV64I-LP64E-NEXT:    lw a0, 124(a5)
-; RV64I-LP64E-NEXT:    lw a6, 96(a5)
-; RV64I-LP64E-NEXT:    lw a4, 100(a5)
-; RV64I-LP64E-NEXT:    lw a2, 104(a5)
-; RV64I-LP64E-NEXT:    lw a1, 108(a5)
-; RV64I-LP64E-NEXT:    sw a0, 124(a5)
-; RV64I-LP64E-NEXT:    sw a3, 120(a5)
-; RV64I-LP64E-NEXT:    sw ra, 116(a5)
-; RV64I-LP64E-NEXT:    sw s1, 112(a5)
-; RV64I-LP64E-NEXT:    sw a1, 108(a5)
-; RV64I-LP64E-NEXT:    sw a2, 104(a5)
-; RV64I-LP64E-NEXT:    sw a4, 100(a5)
-; RV64I-LP64E-NEXT:    sw a6, 96(a5)
+; RV64I-LP64E-NEXT:    lw s1, 96(a5)
+; RV64I-LP64E-NEXT:    lw ra, 100(a5)
+; RV64I-LP64E-NEXT:    lw a6, 104(a5)
+; RV64I-LP64E-NEXT:    lw a3, 108(a5)
+; RV64I-LP64E-NEXT:    lw a2, 112(a5)
+; RV64I-LP64E-NEXT:    lw a1, 116(a5)
+; RV64I-LP64E-NEXT:    lw a0, 120(a5)
+; RV64I-LP64E-NEXT:    lw a7, 124(a5)
+; RV64I-LP64E-NEXT:    sw a7, 124(a5)
+; RV64I-LP64E-NEXT:    sw a0, 120(a5)
+; RV64I-LP64E-NEXT:    sw a1, 116(a5)
+; RV64I-LP64E-NEXT:    sw a2, 112(a5)
+; RV64I-LP64E-NEXT:    sw a3, 108(a5)
+; RV64I-LP64E-NEXT:    sw a6, 104(a5)
+; RV64I-LP64E-NEXT:    sw ra, 100(a5)
+; RV64I-LP64E-NEXT:    sw s1, 96(a5)
 ; RV64I-LP64E-NEXT:    sw s0, 92(a5)
 ; RV64I-LP64E-NEXT:    sw s11, 88(a5)
 ; RV64I-LP64E-NEXT:    sw s10, 84(a5)
@@ -876,13 +876,13 @@ define void @callee() {
 ; RV64I-LP64E-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
 ; RV64I-LP64E-NEXT:    sw a0, 16(a5)
 ; RV64I-LP64E-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-LP64E-NEXT:    sw a0, %lo(var+12)(a7)
+; RV64I-LP64E-NEXT:    sw a0, %lo(var+12)(a4)
 ; RV64I-LP64E-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-LP64E-NEXT:    sw a0, %lo(var+8)(a7)
+; RV64I-LP64E-NEXT:    sw a0, %lo(var+8)(a4)
 ; RV64I-LP64E-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-LP64E-NEXT:    sw a0, %lo(var+4)(a7)
+; RV64I-LP64E-NEXT:    sw a0, %lo(var+4)(a4)
 ; RV64I-LP64E-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-LP64E-NEXT:    sw a0, %lo(var)(a7)
+; RV64I-LP64E-NEXT:    sw a0, %lo(var)(a4)
 ; RV64I-LP64E-NEXT:    ld ra, 64(sp) # 8-byte Folded Reload
 ; RV64I-LP64E-NEXT:    ld s0, 56(sp) # 8-byte Folded Reload
 ; RV64I-LP64E-NEXT:    ld s1, 48(sp) # 8-byte Folded Reload
@@ -925,16 +925,16 @@ define void @callee() {
 ; RV64I-WITH-FP-NEXT:    .cfi_offset s11, -104
 ; RV64I-WITH-FP-NEXT:    addi s0, sp, 160
 ; RV64I-WITH-FP-NEXT:    .cfi_def_cfa s0, 0
-; RV64I-WITH-FP-NEXT:    lui t0, %hi(var)
-; RV64I-WITH-FP-NEXT:    lw a0, %lo(var)(t0)
+; RV64I-WITH-FP-NEXT:    lui a4, %hi(var)
+; RV64I-WITH-FP-NEXT:    lw a0, %lo(var)(a4)
 ; RV64I-WITH-FP-NEXT:    sd a0, -112(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, %lo(var+4)(t0)
+; RV64I-WITH-FP-NEXT:    lw a0, %lo(var+4)(a4)
 ; RV64I-WITH-FP-NEXT:    sd a0, -120(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, %lo(var+8)(t0)
+; RV64I-WITH-FP-NEXT:    lw a0, %lo(var+8)(a4)
 ; RV64I-WITH-FP-NEXT:    sd a0, -128(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, %lo(var+12)(t0)
+; RV64I-WITH-FP-NEXT:    lw a0, %lo(var+12)(a4)
 ; RV64I-WITH-FP-NEXT:    sd a0, -136(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    addi a5, t0, %lo(var)
+; RV64I-WITH-FP-NEXT:    addi a5, a4, %lo(var)
 ; RV64I-WITH-FP-NEXT:    lw a0, 16(a5)
 ; RV64I-WITH-FP-NEXT:    sd a0, -144(s0) # 8-byte Folded Spill
 ; RV64I-WITH-FP-NEXT:    lw a0, 20(a5)
@@ -958,22 +958,22 @@ define void @callee() {
 ; RV64I-WITH-FP-NEXT:    lw s9, 84(a5)
 ; RV64I-WITH-FP-NEXT:    lw s10, 88(a5)
 ; RV64I-WITH-FP-NEXT:    lw s11, 92(a5)
-; RV64I-WITH-FP-NEXT:    lw ra, 112(a5)
-; RV64I-WITH-FP-NEXT:    lw a4, 116(a5)
-; RV64I-WITH-FP-NEXT:    lw a3, 120(a5)
-; RV64I-WITH-FP-NEXT:    lw a0, 124(a5)
-; RV64I-WITH-FP-NEXT:    lw a7, 96(a5)
-; RV64I-WITH-FP-NEXT:    lw a6, 100(a5)
-; RV64I-WITH-FP-NEXT:    lw a2, 104(a5)
-; RV64I-WITH-FP-NEXT:    lw a1, 108(a5)
-; RV64I-WITH-FP-NEXT:    sw a0, 124(a5)
-; RV64I-WITH-FP-NEXT:    sw a3, 120(a5)
-; RV64I-WITH-FP-NEXT:    sw a4, 116(a5)
-; RV64I-WITH-FP-NEXT:    sw ra, 112(a5)
-; RV64I-WITH-FP-NEXT:    sw a1, 108(a5)
-; RV64I-WITH-FP-NEXT:    sw a2, 104(a5)
-; RV64I-WITH-FP-NEXT:    sw a6, 100(a5)
-; RV64I-WITH-FP-NEXT:    sw a7, 96(a5)
+; RV64I-WITH-FP-NEXT:    lw ra, 96(a5)
+; RV64I-WITH-FP-NEXT:    lw a7, 100(a5)
+; RV64I-WITH-FP-NEXT:    lw a6, 104(a5)
+; RV64I-WITH-FP-NEXT:    lw a3, 108(a5)
+; RV64I-WITH-FP-NEXT:    lw a2, 112(a5)
+; RV64I-WITH-FP-NEXT:    lw a1, 116(a5)
+; RV64I-WITH-FP-NEXT:    lw a0, 120(a5)
+; RV64I-WITH-FP-NEXT:    lw t0, 124(a5)
+; RV64I-WITH-FP-NEXT:    sw t0, 124(a5)
+; RV64I-WITH-FP-NEXT:    sw a0, 120(a5)
+; RV64I-WITH-FP-NEXT:    sw a1, 116(a5)
+; RV64I-WITH-FP-NEXT:    sw a2, 112(a5)
+; RV64I-WITH-FP-NEXT:    sw a3, 108(a5)
+; RV64I-WITH-FP-NEXT:    sw a6, 104(a5)
+; RV64I-WITH-FP-NEXT:    sw a7, 100(a5)
+; RV64I-WITH-FP-NEXT:    sw ra, 96(a5)
 ; RV64I-WITH-FP-NEXT:    sw s11, 92(a5)
 ; RV64I-WITH-FP-NEXT:    sw s10, 88(a5)
 ; RV64I-WITH-FP-NEXT:    sw s9, 84(a5)
@@ -998,13 +998,13 @@ define void @callee() {
 ; RV64I-WITH-FP-NEXT:    ld a0, -144(s0) # 8-byte Folded Reload
 ; RV64I-WITH-FP-NEXT:    sw a0, 16(a5)
 ; RV64I-WITH-FP-NEXT:    ld a0, -136(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, %lo(var+12)(t0)
+; RV64I-WITH-FP-NEXT:    sw a0, %lo(var+12)(a4)
 ; RV64I-WITH-FP-NEXT:    ld a0, -128(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, %lo(var+8)(t0)
+; RV64I-WITH-FP-NEXT:    sw a0, %lo(var+8)(a4)
 ; RV64I-WITH-FP-NEXT:    ld a0, -120(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, %lo(var+4)(t0)
+; RV64I-WITH-FP-NEXT:    sw a0, %lo(var+4)(a4)
 ; RV64I-WITH-FP-NEXT:    ld a0, -112(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, %lo(var)(t0)
+; RV64I-WITH-FP-NEXT:    sw a0, %lo(var)(a4)
 ; RV64I-WITH-FP-NEXT:    .cfi_def_cfa sp, 160
 ; RV64I-WITH-FP-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
 ; RV64I-WITH-FP-NEXT:    ld s0, 144(sp) # 8-byte Folded Reload
@@ -1053,16 +1053,16 @@ define void @callee() {
 ; RV64IZCMP-NEXT:    .cfi_offset s9, -24
 ; RV64IZCMP-NEXT:    .cfi_offset s10, -16
 ; RV64IZCMP-NEXT:    .cfi_offset s11, -8
-; RV64IZCMP-NEXT:    lui t0, %hi(var)
-; RV64IZCMP-NEXT:    lw a0, %lo(var)(t0)
+; RV64IZCMP-NEXT:    lui a4, %hi(var)
+; RV64IZCMP-NEXT:    lw a0, %lo(var)(a4)
 ; RV64IZCMP-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var+4)(t0)
+; RV64IZCMP-NEXT:    lw a0, %lo(var+4)(a4)
 ; RV64IZCMP-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var+8)(t0)
+; RV64IZCMP-NEXT:    lw a0, %lo(var+8)(a4)
 ; RV64IZCMP-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var+12)(t0)
+; RV64IZCMP-NEXT:    lw a0, %lo(var+12)(a4)
 ; RV64IZCMP-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    addi a5, t0, %lo(var)
+; RV64IZCMP-NEXT:    addi a5, a4, %lo(var)
 ; RV64IZCMP-NEXT:    lw a0, 16(a5)
 ; RV64IZCMP-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
 ; RV64IZCMP-NEXT:    lw a0, 20(a5)
@@ -1082,28 +1082,28 @@ define void @callee() {
 ; RV64IZCMP-NEXT:    lw s11, 72(a5)
 ; RV64IZCMP-NEXT:    lw ra, 76(a5)
 ; RV64IZCMP-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-NEXT:    lw a7, 112(a5)
-; RV64IZCMP-NEXT:    lw s0, 116(a5)
-; RV64IZCMP-NEXT:    lw a3, 120(a5)
-; RV64IZCMP-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-NEXT:    lw a6, 96(a5)
-; RV64IZCMP-NEXT:    lw a4, 100(a5)
-; RV64IZCMP-NEXT:    lw a2, 104(a5)
-; RV64IZCMP-NEXT:    lw a1, 108(a5)
-; RV64IZCMP-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-NEXT:    sw a3, 120(a5)
-; RV64IZCMP-NEXT:    sw s0, 116(a5)
-; RV64IZCMP-NEXT:    sw a7, 112(a5)
-; RV64IZCMP-NEXT:    sw a1, 108(a5)
-; RV64IZCMP-NEXT:    sw a2, 104(a5)
-; RV64IZCMP-NEXT:    sw a4, 100(a5)
-; RV64IZCMP-NEXT:    sw a6, 96(a5)
-; RV64IZCMP-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-NEXT:    sw t3, 84(a5)
+; RV64IZCMP-NEXT:    lw t2, 84(a5)
+; RV64IZCMP-NEXT:    lw t1, 88(a5)
+; RV64IZCMP-NEXT:    lw t0, 92(a5)
+; RV64IZCMP-NEXT:    lw a7, 96(a5)
+; RV64IZCMP-NEXT:    lw s0, 100(a5)
+; RV64IZCMP-NEXT:    lw a6, 104(a5)
+; RV64IZCMP-NEXT:    lw a3, 108(a5)
+; RV64IZCMP-NEXT:    lw a2, 112(a5)
+; RV64IZCMP-NEXT:    lw a1, 116(a5)
+; RV64IZCMP-NEXT:    lw a0, 120(a5)
+; RV64IZCMP-NEXT:    lw t3, 124(a5)
+; RV64IZCMP-NEXT:    sw t3, 124(a5)
+; RV64IZCMP-NEXT:    sw a0, 120(a5)
+; RV64IZCMP-NEXT:    sw a1, 116(a5)
+; RV64IZCMP-NEXT:    sw a2, 112(a5)
+; RV64IZCMP-NEXT:    sw a3, 108(a5)
+; RV64IZCMP-NEXT:    sw a6, 104(a5)
+; RV64IZCMP-NEXT:    sw s0, 100(a5)
+; RV64IZCMP-NEXT:    sw a7, 96(a5)
+; RV64IZCMP-NEXT:    sw t0, 92(a5)
+; RV64IZCMP-NEXT:    sw t1, 88(a5)
+; RV64IZCMP-NEXT:    sw t2, 84(a5)
 ; RV64IZCMP-NEXT:    sw s1, 80(a5)
 ; RV64IZCMP-NEXT:    sw ra, 76(a5)
 ; RV64IZCMP-NEXT:    sw s11, 72(a5)
@@ -1124,13 +1124,13 @@ define void @callee() {
 ; RV64IZCMP-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
 ; RV64IZCMP-NEXT:    sw a0, 16(a5)
 ; RV64IZCMP-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var+12)(t0)
+; RV64IZCMP-NEXT:    sw a0, %lo(var+12)(a4)
 ; RV64IZCMP-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var+8)(t0)
+; RV64IZCMP-NEXT:    sw a0, %lo(var+8)(a4)
 ; RV64IZCMP-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var+4)(t0)
+; RV64IZCMP-NEXT:    sw a0, %lo(var+4)(a4)
 ; RV64IZCMP-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var)(t0)
+; RV64IZCMP-NEXT:    sw a0, %lo(var)(a4)
 ; RV64IZCMP-NEXT:    cm.popret {ra, s0-s11}, 160
 ;
 ; RV64IZCMP-WITH-FP-LABEL: callee:
@@ -1165,16 +1165,16 @@ define void @callee() {
 ; RV64IZCMP-WITH-FP-NEXT:    .cfi_offset s11, -104
 ; RV64IZCMP-WITH-FP-NEXT:    addi s0, sp, 160
 ; RV64IZCMP-WITH-FP-NEXT:    .cfi_def_cfa s0, 0
-; RV64IZCMP-WITH-FP-NEXT:    lui t1, %hi(var)
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var)(t1)
+; RV64IZCMP-WITH-FP-NEXT:    lui a4, %hi(var)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var)(a4)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -112(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+4)(t1)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+4)(a4)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -120(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+8)(t1)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+8)(a4)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -128(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+12)(t1)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+12)(a4)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -136(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    addi a5, t1, %lo(var)
+; RV64IZCMP-WITH-FP-NEXT:    addi a5, a4, %lo(var)
 ; RV64IZCMP-WITH-FP-NEXT:    lw a0, 16(a5)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -144(s0) # 8-byte Folded Spill
 ; RV64IZCMP-WITH-FP-NEXT:    lw a0, 20(a5)
@@ -1194,30 +1194,30 @@ define void @callee() {
 ; RV64IZCMP-WITH-FP-NEXT:    lw s10, 68(a5)
 ; RV64IZCMP-WITH-FP-NEXT:    lw s11, 72(a5)
 ; RV64IZCMP-WITH-FP-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t4, 80(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t2, 88(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw t3, 80(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw t2, 84(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw t1, 88(a5)
 ; RV64IZCMP-WITH-FP-NEXT:    lw s1, 92(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t0, 112(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a4, 116(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a3, 120(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a7, 96(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a6, 100(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a2, 104(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a1, 108(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a3, 120(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a4, 116(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t0, 112(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a1, 108(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a2, 104(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a6, 100(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a7, 96(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw t0, 96(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw a7, 100(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw a6, 104(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw a3, 108(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw a2, 112(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw a1, 116(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 120(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw t4, 124(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw t4, 124(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 120(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw a1, 116(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw a2, 112(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw a3, 108(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw a6, 104(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw a7, 100(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw t0, 96(a5)
 ; RV64IZCMP-WITH-FP-NEXT:    sw s1, 92(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t4, 80(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw t1, 88(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw t2, 84(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw t3, 80(a5)
 ; RV64IZCMP-WITH-FP-NEXT:    sw ra, 76(a5)
 ; RV64IZCMP-WITH-FP-NEXT:    sw s11, 72(a5)
 ; RV64IZCMP-WITH-FP-NEXT:    sw s10, 68(a5)
@@ -1238,13 +1238,13 @@ define void @callee() {
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -144(s0) # 8-byte Folded Reload
 ; RV64IZCMP-WITH-FP-NEXT:    sw a0, 16(a5)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -136(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+12)(t1)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+12)(a4)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -128(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+8)(t1)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+8)(a4)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -120(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+4)(t1)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+4)(a4)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -112(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var)(t1)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var)(a4)
 ; RV64IZCMP-WITH-FP-NEXT:    .cfi_def_cfa sp, 160
 ; RV64IZCMP-WITH-FP-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
 ; RV64IZCMP-WITH-FP-NEXT:    ld s0, 144(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-half.ll b/llvm/test/CodeGen/RISCV/calling-conv-half.ll
index 541c9b4d40c7e..aa08c3f5c95b1 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-half.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-half.ll
@@ -225,8 +225,8 @@ define i32 @callee_half_on_stack(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f,
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lhu a0, 16(sp)
 ; RV32I-NEXT:    mv s0, a7
+; RV32I-NEXT:    lhu a0, 16(sp)
 ; RV32I-NEXT:    call __extendhfsf2
 ; RV32I-NEXT:    call __fixsfsi
 ; RV32I-NEXT:    add a0, s0, a0
@@ -240,8 +240,8 @@ define i32 @callee_half_on_stack(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f,
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lhu a0, 16(sp)
 ; RV64I-NEXT:    mv s0, a7
+; RV64I-NEXT:    lhu a0, 16(sp)
 ; RV64I-NEXT:    call __extendhfsf2
 ; RV64I-NEXT:    call __fixsfdi
 ; RV64I-NEXT:    addw a0, s0, a0
@@ -255,8 +255,8 @@ define i32 @callee_half_on_stack(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f,
 ; RV32IF-NEXT:    addi sp, sp, -16
 ; RV32IF-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    lhu a0, 16(sp)
 ; RV32IF-NEXT:    mv s0, a7
+; RV32IF-NEXT:    lhu a0, 16(sp)
 ; RV32IF-NEXT:    call __extendhfsf2
 ; RV32IF-NEXT:    fmv.w.x fa5, a0
 ; RV32IF-NEXT:    fcvt.w.s a0, fa5, rtz
@@ -271,8 +271,8 @@ define i32 @callee_half_on_stack(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f,
 ; RV64IF-NEXT:    addi sp, sp, -16
 ; RV64IF-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64IF-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
-; RV64IF-NEXT:    lhu a0, 16(sp)
 ; RV64IF-NEXT:    mv s0, a7
+; RV64IF-NEXT:    lhu a0, 16(sp)
 ; RV64IF-NEXT:    call __extendhfsf2
 ; RV64IF-NEXT:    fmv.w.x fa5, a0
 ; RV64IF-NEXT:    fcvt.l.s a0, fa5, rtz
@@ -341,9 +341,9 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 6
 ; RV32I-NEXT:    li a6, 7
-; RV32I-NEXT:    addi t0, a7, -1792
+; RV32I-NEXT:    addi a7, a7, -1792
+; RV32I-NEXT:    sw a7, 0(sp)
 ; RV32I-NEXT:    li a7, 8
-; RV32I-NEXT:    sw t0, 0(sp)
 ; RV32I-NEXT:    call callee_half_on_stack
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
@@ -361,9 +361,9 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    li a5, 6
 ; RV64I-NEXT:    li a6, 7
-; RV64I-NEXT:    addiw t0, a7, -1792
+; RV64I-NEXT:    addiw a7, a7, -1792
+; RV64I-NEXT:    sd a7, 0(sp)
 ; RV64I-NEXT:    li a7, 8
-; RV64I-NEXT:    sd t0, 0(sp)
 ; RV64I-NEXT:    call callee_half_on_stack
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -381,9 +381,9 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV32IF-NEXT:    li a4, 5
 ; RV32IF-NEXT:    li a5, 6
 ; RV32IF-NEXT:    li a6, 7
-; RV32IF-NEXT:    addi t0, a7, -1792
+; RV32IF-NEXT:    addi a7, a7, -1792
+; RV32IF-NEXT:    sw a7, 0(sp)
 ; RV32IF-NEXT:    li a7, 8
-; RV32IF-NEXT:    sw t0, 0(sp)
 ; RV32IF-NEXT:    call callee_half_on_stack
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 16
@@ -401,9 +401,9 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV64IF-NEXT:    li a4, 5
 ; RV64IF-NEXT:    li a5, 6
 ; RV64IF-NEXT:    li a6, 7
-; RV64IF-NEXT:    addi t0, a7, -1792
+; RV64IF-NEXT:    addi a7, a7, -1792
+; RV64IF-NEXT:    sw a7, 0(sp)
 ; RV64IF-NEXT:    li a7, 8
-; RV64IF-NEXT:    sw t0, 0(sp)
 ; RV64IF-NEXT:    call callee_half_on_stack
 ; RV64IF-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64IF-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
index 9387b7ef4c32e..6697cd0e503e7 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
@@ -94,15 +94,15 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
 ; RV32I-FPELIM-LABEL: callee_aligned_stack:
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    lw a0, 0(a2)
-; RV32I-FPELIM-NEXT:    lw a1, 8(sp)
-; RV32I-FPELIM-NEXT:    lw a2, 0(sp)
-; RV32I-FPELIM-NEXT:    lw a3, 20(sp)
+; RV32I-FPELIM-NEXT:    lw a1, 20(sp)
+; RV32I-FPELIM-NEXT:    lw a2, 8(sp)
+; RV32I-FPELIM-NEXT:    lw a3, 0(sp)
 ; RV32I-FPELIM-NEXT:    lw a4, 16(sp)
 ; RV32I-FPELIM-NEXT:    add a0, a0, a7
-; RV32I-FPELIM-NEXT:    add a1, a2, a1
+; RV32I-FPELIM-NEXT:    add a2, a3, a2
+; RV32I-FPELIM-NEXT:    add a0, a0, a2
+; RV32I-FPELIM-NEXT:    add a1, a4, a1
 ; RV32I-FPELIM-NEXT:    add a0, a0, a1
-; RV32I-FPELIM-NEXT:    add a3, a4, a3
-; RV32I-FPELIM-NEXT:    add a0, a0, a3
 ; RV32I-FPELIM-NEXT:    ret
 ;
 ; RV32I-WITHFP-LABEL: callee_aligned_stack:
@@ -112,15 +112,15 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
 ; RV32I-WITHFP-NEXT:    lw a0, 0(a2)
-; RV32I-WITHFP-NEXT:    lw a1, 8(s0)
-; RV32I-WITHFP-NEXT:    lw a2, 0(s0)
-; RV32I-WITHFP-NEXT:    lw a3, 20(s0)
+; RV32I-WITHFP-NEXT:    lw a1, 20(s0)
+; RV32I-WITHFP-NEXT:    lw a2, 8(s0)
+; RV32I-WITHFP-NEXT:    lw a3, 0(s0)
 ; RV32I-WITHFP-NEXT:    lw a4, 16(s0)
 ; RV32I-WITHFP-NEXT:    add a0, a0, a7
-; RV32I-WITHFP-NEXT:    add a1, a2, a1
+; RV32I-WITHFP-NEXT:    add a2, a3, a2
+; RV32I-WITHFP-NEXT:    add a0, a0, a2
+; RV32I-WITHFP-NEXT:    add a1, a4, a1
 ; RV32I-WITHFP-NEXT:    add a0, a0, a1
-; RV32I-WITHFP-NEXT:    add a3, a4, a3
-; RV32I-WITHFP-NEXT:    add a0, a0, a3
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    addi sp, sp, 16
@@ -145,45 +145,43 @@ define void @caller_aligned_stack() nounwind {
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -64
 ; RV32I-FPELIM-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32I-FPELIM-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; RV32I-FPELIM-NEXT:    li a5, 18
-; RV32I-FPELIM-NEXT:    li a6, 17
-; RV32I-FPELIM-NEXT:    li a7, 16
-; RV32I-FPELIM-NEXT:    lui t0, 262236
-; RV32I-FPELIM-NEXT:    lui t1, 377487
-; RV32I-FPELIM-NEXT:    li t2, 15
-; RV32I-FPELIM-NEXT:    lui t3, 262153
-; RV32I-FPELIM-NEXT:    lui t4, 545260
-; RV32I-FPELIM-NEXT:    lui t5, 964690
-; RV32I-FPELIM-NEXT:    lui t6, 335544
-; RV32I-FPELIM-NEXT:    lui s0, 688509
+; RV32I-FPELIM-NEXT:    li a4, 18
+; RV32I-FPELIM-NEXT:    li a5, 17
+; RV32I-FPELIM-NEXT:    li a6, 16
+; RV32I-FPELIM-NEXT:    lui a7, 262236
+; RV32I-FPELIM-NEXT:    lui t0, 377487
+; RV32I-FPELIM-NEXT:    li t1, 15
+; RV32I-FPELIM-NEXT:    lui t2, 262153
+; RV32I-FPELIM-NEXT:    lui t3, 545260
+; RV32I-FPELIM-NEXT:    lui t4, 964690
+; RV32I-FPELIM-NEXT:    lui t5, 335544
+; RV32I-FPELIM-NEXT:    lui t6, 688509
 ; RV32I-FPELIM-NEXT:    li a0, 1
 ; RV32I-FPELIM-NEXT:    li a1, 11
 ; RV32I-FPELIM-NEXT:    addi a2, sp, 32
 ; RV32I-FPELIM-NEXT:    li a3, 12
+; RV32I-FPELIM-NEXT:    sw a5, 20(sp)
+; RV32I-FPELIM-NEXT:    sw a4, 24(sp)
 ; RV32I-FPELIM-NEXT:    li a4, 13
-; RV32I-FPELIM-NEXT:    sw a6, 20(sp)
-; RV32I-FPELIM-NEXT:    sw a5, 24(sp)
-; RV32I-FPELIM-NEXT:    li a6, 4
-; RV32I-FPELIM-NEXT:    addi a5, t0, 655
-; RV32I-FPELIM-NEXT:    addi t0, t1, 1475
-; RV32I-FPELIM-NEXT:    sw t2, 0(sp)
-; RV32I-FPELIM-NEXT:    sw t0, 8(sp)
+; RV32I-FPELIM-NEXT:    addi a5, a7, 655
+; RV32I-FPELIM-NEXT:    addi a7, t0, 1475
+; RV32I-FPELIM-NEXT:    sw t1, 0(sp)
+; RV32I-FPELIM-NEXT:    sw a7, 8(sp)
 ; RV32I-FPELIM-NEXT:    sw a5, 12(sp)
-; RV32I-FPELIM-NEXT:    sw a7, 16(sp)
+; RV32I-FPELIM-NEXT:    sw a6, 16(sp)
+; RV32I-FPELIM-NEXT:    li a6, 4
+; RV32I-FPELIM-NEXT:    addi a7, t2, 491
+; RV32I-FPELIM-NEXT:    addi t0, t3, -1967
+; RV32I-FPELIM-NEXT:    addi t1, t4, -328
+; RV32I-FPELIM-NEXT:    addi t2, t5, 1311
+; RV32I-FPELIM-NEXT:    addi a5, t6, -2048
+; RV32I-FPELIM-NEXT:    sw t2, 32(sp)
+; RV32I-FPELIM-NEXT:    sw t1, 36(sp)
+; RV32I-FPELIM-NEXT:    sw t0, 40(sp)
+; RV32I-FPELIM-NEXT:    sw a7, 44(sp)
 ; RV32I-FPELIM-NEXT:    li a7, 14
-; RV32I-FPELIM-NEXT:    addi t0, t3, 491
-; RV32I-FPELIM-NEXT:    addi t1, t4, -1967
-; RV32I-FPELIM-NEXT:    addi t2, t5, -328
-; RV32I-FPELIM-NEXT:    addi t3, t6, 1311
-; RV32I-FPELIM-NEXT:    addi a5, s0, -2048
-; RV32I-FPELIM-NEXT:    sw t3, 32(sp)
-; RV32I-FPELIM-NEXT:    sw t2, 36(sp)
-; RV32I-FPELIM-NEXT:    sw t1, 40(sp)
-; RV32I-FPELIM-NEXT:    sw t0, 44(sp)
 ; RV32I-FPELIM-NEXT:    call callee_aligned_stack
 ; RV32I-FPELIM-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32I-FPELIM-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
 ; RV32I-FPELIM-NEXT:    addi sp, sp, 64
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -192,47 +190,45 @@ define void @caller_aligned_stack() nounwind {
 ; RV32I-WITHFP-NEXT:    addi sp, sp, -64
 ; RV32I-WITHFP-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; RV32I-WITHFP-NEXT:    sw s1, 52(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 64
-; RV32I-WITHFP-NEXT:    li a5, 18
-; RV32I-WITHFP-NEXT:    li a6, 17
-; RV32I-WITHFP-NEXT:    li a7, 16
-; RV32I-WITHFP-NEXT:    lui t0, 262236
-; RV32I-WITHFP-NEXT:    lui t1, 377487
-; RV32I-WITHFP-NEXT:    li t2, 15
-; RV32I-WITHFP-NEXT:    lui t3, 262153
-; RV32I-WITHFP-NEXT:    lui t4, 545260
-; RV32I-WITHFP-NEXT:    lui t5, 964690
-; RV32I-WITHFP-NEXT:    lui t6, 335544
-; RV32I-WITHFP-NEXT:    lui s1, 688509
+; RV32I-WITHFP-NEXT:    li a4, 18
+; RV32I-WITHFP-NEXT:    li a5, 17
+; RV32I-WITHFP-NEXT:    li a6, 16
+; RV32I-WITHFP-NEXT:    lui a7, 262236
+; RV32I-WITHFP-NEXT:    lui t0, 377487
+; RV32I-WITHFP-NEXT:    li t1, 15
+; RV32I-WITHFP-NEXT:    lui t2, 262153
+; RV32I-WITHFP-NEXT:    lui t3, 545260
+; RV32I-WITHFP-NEXT:    lui t4, 964690
+; RV32I-WITHFP-NEXT:    lui t5, 335544
+; RV32I-WITHFP-NEXT:    lui t6, 688509
 ; RV32I-WITHFP-NEXT:    li a0, 1
 ; RV32I-WITHFP-NEXT:    li a1, 11
 ; RV32I-WITHFP-NEXT:    addi a2, s0, -32
 ; RV32I-WITHFP-NEXT:    li a3, 12
+; RV32I-WITHFP-NEXT:    sw a5, 20(sp)
+; RV32I-WITHFP-NEXT:    sw a4, 24(sp)
 ; RV32I-WITHFP-NEXT:    li a4, 13
-; RV32I-WITHFP-NEXT:    sw a6, 20(sp)
-; RV32I-WITHFP-NEXT:    sw a5, 24(sp)
-; RV32I-WITHFP-NEXT:    li a6, 4
-; RV32I-WITHFP-NEXT:    addi a5, t0, 655
-; RV32I-WITHFP-NEXT:    addi t0, t1, 1475
-; RV32I-WITHFP-NEXT:    sw t2, 0(sp)
-; RV32I-WITHFP-NEXT:    sw t0, 8(sp)
+; RV32I-WITHFP-NEXT:    addi a5, a7, 655
+; RV32I-WITHFP-NEXT:    addi a7, t0, 1475
+; RV32I-WITHFP-NEXT:    sw t1, 0(sp)
+; RV32I-WITHFP-NEXT:    sw a7, 8(sp)
 ; RV32I-WITHFP-NEXT:    sw a5, 12(sp)
-; RV32I-WITHFP-NEXT:    sw a7, 16(sp)
+; RV32I-WITHFP-NEXT:    sw a6, 16(sp)
+; RV32I-WITHFP-NEXT:    li a6, 4
+; RV32I-WITHFP-NEXT:    addi a7, t2, 491
+; RV32I-WITHFP-NEXT:    addi t0, t3, -1967
+; RV32I-WITHFP-NEXT:    addi t1, t4, -328
+; RV32I-WITHFP-NEXT:    addi t2, t5, 1311
+; RV32I-WITHFP-NEXT:    addi a5, t6, -2048
+; RV32I-WITHFP-NEXT:    sw t2, -32(s0)
+; RV32I-WITHFP-NEXT:    sw t1, -28(s0)
+; RV32I-WITHFP-NEXT:    sw t0, -24(s0)
+; RV32I-WITHFP-NEXT:    sw a7, -20(s0)
 ; RV32I-WITHFP-NEXT:    li a7, 14
-; RV32I-WITHFP-NEXT:    addi t0, t3, 491
-; RV32I-WITHFP-NEXT:    addi t1, t4, -1967
-; RV32I-WITHFP-NEXT:    addi t2, t5, -328
-; RV32I-WITHFP-NEXT:    addi t3, t6, 1311
-; RV32I-WITHFP-NEXT:    addi a5, s1, -2048
-; RV32I-WITHFP-NEXT:    sw t3, -32(s0)
-; RV32I-WITHFP-NEXT:    sw t2, -28(s0)
-; RV32I-WITHFP-NEXT:    sw t1, -24(s0)
-; RV32I-WITHFP-NEXT:    sw t0, -20(s0)
 ; RV32I-WITHFP-NEXT:    call callee_aligned_stack
 ; RV32I-WITHFP-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32I-WITHFP-NEXT:    lw s1, 52(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    addi sp, sp, 64
 ; RV32I-WITHFP-NEXT:    ret
   %1 = call i32 @callee_aligned_stack(i32 1, i32 11,
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
index 18916dd69eb43..f54e86b497945 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
@@ -149,9 +149,9 @@ define i32 @caller_many_scalars() nounwind {
 ; RV32I-FPELIM-NEXT:    li a3, 4
 ; RV32I-FPELIM-NEXT:    li a5, 5
 ; RV32I-FPELIM-NEXT:    li a6, 6
-; RV32I-FPELIM-NEXT:    li a7, 7
 ; RV32I-FPELIM-NEXT:    sw zero, 0(sp)
 ; RV32I-FPELIM-NEXT:    sw a4, 4(sp)
+; RV32I-FPELIM-NEXT:    li a7, 7
 ; RV32I-FPELIM-NEXT:    li a4, 0
 ; RV32I-FPELIM-NEXT:    call callee_many_scalars
 ; RV32I-FPELIM-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -171,9 +171,9 @@ define i32 @caller_many_scalars() nounwind {
 ; RV32I-WITHFP-NEXT:    li a3, 4
 ; RV32I-WITHFP-NEXT:    li a5, 5
 ; RV32I-WITHFP-NEXT:    li a6, 6
-; RV32I-WITHFP-NEXT:    li a7, 7
 ; RV32I-WITHFP-NEXT:    sw zero, 0(sp)
 ; RV32I-WITHFP-NEXT:    sw a4, 4(sp)
+; RV32I-WITHFP-NEXT:    li a7, 7
 ; RV32I-WITHFP-NEXT:    li a4, 0
 ; RV32I-WITHFP-NEXT:    call callee_many_scalars
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -194,17 +194,17 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-FPELIM-NEXT:    lw a3, 4(a1)
 ; RV32I-FPELIM-NEXT:    lw a4, 8(a1)
 ; RV32I-FPELIM-NEXT:    lw a1, 12(a1)
-; RV32I-FPELIM-NEXT:    lw a5, 12(a0)
+; RV32I-FPELIM-NEXT:    lw a5, 0(a0)
 ; RV32I-FPELIM-NEXT:    lw a6, 4(a0)
 ; RV32I-FPELIM-NEXT:    lw a7, 8(a0)
-; RV32I-FPELIM-NEXT:    lw a0, 0(a0)
-; RV32I-FPELIM-NEXT:    xor a1, a5, a1
-; RV32I-FPELIM-NEXT:    xor a3, a6, a3
-; RV32I-FPELIM-NEXT:    xor a4, a7, a4
-; RV32I-FPELIM-NEXT:    xor a0, a0, a2
-; RV32I-FPELIM-NEXT:    or a1, a3, a1
-; RV32I-FPELIM-NEXT:    or a0, a0, a4
-; RV32I-FPELIM-NEXT:    or a0, a0, a1
+; RV32I-FPELIM-NEXT:    lw a0, 12(a0)
+; RV32I-FPELIM-NEXT:    xor a0, a0, a1
+; RV32I-FPELIM-NEXT:    xor a1, a6, a3
+; RV32I-FPELIM-NEXT:    xor a3, a7, a4
+; RV32I-FPELIM-NEXT:    xor a2, a5, a2
+; RV32I-FPELIM-NEXT:    or a0, a1, a0
+; RV32I-FPELIM-NEXT:    or a2, a2, a3
+; RV32I-FPELIM-NEXT:    or a0, a2, a0
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -218,17 +218,17 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-WITHFP-NEXT:    lw a3, 4(a1)
 ; RV32I-WITHFP-NEXT:    lw a4, 8(a1)
 ; RV32I-WITHFP-NEXT:    lw a1, 12(a1)
-; RV32I-WITHFP-NEXT:    lw a5, 12(a0)
+; RV32I-WITHFP-NEXT:    lw a5, 0(a0)
 ; RV32I-WITHFP-NEXT:    lw a6, 4(a0)
 ; RV32I-WITHFP-NEXT:    lw a7, 8(a0)
-; RV32I-WITHFP-NEXT:    lw a0, 0(a0)
-; RV32I-WITHFP-NEXT:    xor a1, a5, a1
-; RV32I-WITHFP-NEXT:    xor a3, a6, a3
-; RV32I-WITHFP-NEXT:    xor a4, a7, a4
-; RV32I-WITHFP-NEXT:    xor a0, a0, a2
-; RV32I-WITHFP-NEXT:    or a1, a3, a1
-; RV32I-WITHFP-NEXT:    or a0, a0, a4
-; RV32I-WITHFP-NEXT:    or a0, a0, a1
+; RV32I-WITHFP-NEXT:    lw a0, 12(a0)
+; RV32I-WITHFP-NEXT:    xor a0, a0, a1
+; RV32I-WITHFP-NEXT:    xor a1, a6, a3
+; RV32I-WITHFP-NEXT:    xor a3, a7, a4
+; RV32I-WITHFP-NEXT:    xor a2, a5, a2
+; RV32I-WITHFP-NEXT:    or a0, a1, a0
+; RV32I-WITHFP-NEXT:    or a2, a2, a3
+; RV32I-WITHFP-NEXT:    or a0, a2, a0
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -245,18 +245,18 @@ define i32 @caller_large_scalars() nounwind {
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -48
 ; RV32I-FPELIM-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
-; RV32I-FPELIM-NEXT:    lui a1, 524272
-; RV32I-FPELIM-NEXT:    li a2, 1
-; RV32I-FPELIM-NEXT:    addi a0, sp, 24
+; RV32I-FPELIM-NEXT:    lui a0, 524272
+; RV32I-FPELIM-NEXT:    li a1, 1
 ; RV32I-FPELIM-NEXT:    sw zero, 0(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 4(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 8(sp)
-; RV32I-FPELIM-NEXT:    sw a1, 12(sp)
-; RV32I-FPELIM-NEXT:    mv a1, sp
-; RV32I-FPELIM-NEXT:    sw a2, 24(sp)
+; RV32I-FPELIM-NEXT:    sw a0, 12(sp)
+; RV32I-FPELIM-NEXT:    addi a0, sp, 24
+; RV32I-FPELIM-NEXT:    sw a1, 24(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 28(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 32(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 36(sp)
+; RV32I-FPELIM-NEXT:    mv a1, sp
 ; RV32I-FPELIM-NEXT:    call callee_large_scalars
 ; RV32I-FPELIM-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32I-FPELIM-NEXT:    addi sp, sp, 48
@@ -268,18 +268,18 @@ define i32 @caller_large_scalars() nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 48
-; RV32I-WITHFP-NEXT:    lui a1, 524272
-; RV32I-WITHFP-NEXT:    li a2, 1
-; RV32I-WITHFP-NEXT:    addi a0, s0, -24
+; RV32I-WITHFP-NEXT:    lui a0, 524272
+; RV32I-WITHFP-NEXT:    li a1, 1
 ; RV32I-WITHFP-NEXT:    sw zero, -48(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -44(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -40(s0)
-; RV32I-WITHFP-NEXT:    sw a1, -36(s0)
-; RV32I-WITHFP-NEXT:    addi a1, s0, -48
-; RV32I-WITHFP-NEXT:    sw a2, -24(s0)
+; RV32I-WITHFP-NEXT:    sw a0, -36(s0)
+; RV32I-WITHFP-NEXT:    addi a0, s0, -24
+; RV32I-WITHFP-NEXT:    sw a1, -24(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -20(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -16(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -12(s0)
+; RV32I-WITHFP-NEXT:    addi a1, s0, -48
 ; RV32I-WITHFP-NEXT:    call callee_large_scalars
 ; RV32I-WITHFP-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
@@ -301,17 +301,17 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-FPELIM-NEXT:    lw a2, 4(a7)
 ; RV32I-FPELIM-NEXT:    lw a3, 8(a7)
 ; RV32I-FPELIM-NEXT:    lw a4, 12(a7)
-; RV32I-FPELIM-NEXT:    lw a5, 12(a0)
+; RV32I-FPELIM-NEXT:    lw a5, 0(a0)
 ; RV32I-FPELIM-NEXT:    lw a6, 4(a0)
 ; RV32I-FPELIM-NEXT:    lw a7, 8(a0)
-; RV32I-FPELIM-NEXT:    lw a0, 0(a0)
-; RV32I-FPELIM-NEXT:    xor a4, a4, a5
+; RV32I-FPELIM-NEXT:    lw a0, 12(a0)
+; RV32I-FPELIM-NEXT:    xor a0, a4, a0
 ; RV32I-FPELIM-NEXT:    xor a2, a2, a6
 ; RV32I-FPELIM-NEXT:    xor a3, a3, a7
-; RV32I-FPELIM-NEXT:    xor a0, a1, a0
-; RV32I-FPELIM-NEXT:    or a2, a2, a4
-; RV32I-FPELIM-NEXT:    or a0, a0, a3
-; RV32I-FPELIM-NEXT:    or a0, a0, a2
+; RV32I-FPELIM-NEXT:    xor a1, a1, a5
+; RV32I-FPELIM-NEXT:    or a0, a2, a0
+; RV32I-FPELIM-NEXT:    or a1, a1, a3
+; RV32I-FPELIM-NEXT:    or a0, a1, a0
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -326,17 +326,17 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-WITHFP-NEXT:    lw a2, 4(a7)
 ; RV32I-WITHFP-NEXT:    lw a3, 8(a7)
 ; RV32I-WITHFP-NEXT:    lw a4, 12(a7)
-; RV32I-WITHFP-NEXT:    lw a5, 12(a0)
+; RV32I-WITHFP-NEXT:    lw a5, 0(a0)
 ; RV32I-WITHFP-NEXT:    lw a6, 4(a0)
 ; RV32I-WITHFP-NEXT:    lw a7, 8(a0)
-; RV32I-WITHFP-NEXT:    lw a0, 0(a0)
-; RV32I-WITHFP-NEXT:    xor a4, a4, a5
+; RV32I-WITHFP-NEXT:    lw a0, 12(a0)
+; RV32I-WITHFP-NEXT:    xor a0, a4, a0
 ; RV32I-WITHFP-NEXT:    xor a2, a2, a6
 ; RV32I-WITHFP-NEXT:    xor a3, a3, a7
-; RV32I-WITHFP-NEXT:    xor a0, a1, a0
-; RV32I-WITHFP-NEXT:    or a2, a2, a4
-; RV32I-WITHFP-NEXT:    or a0, a0, a3
-; RV32I-WITHFP-NEXT:    or a0, a0, a2
+; RV32I-WITHFP-NEXT:    xor a1, a1, a5
+; RV32I-WITHFP-NEXT:    or a0, a2, a0
+; RV32I-WITHFP-NEXT:    or a1, a1, a3
+; RV32I-WITHFP-NEXT:    or a0, a1, a0
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -353,28 +353,28 @@ define i32 @caller_large_scalars_exhausted_regs() nounwind {
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -64
 ; RV32I-FPELIM-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32I-FPELIM-NEXT:    addi a6, sp, 16
-; RV32I-FPELIM-NEXT:    li a7, 9
-; RV32I-FPELIM-NEXT:    lui t0, 524272
-; RV32I-FPELIM-NEXT:    li t1, 8
+; RV32I-FPELIM-NEXT:    addi a5, sp, 16
+; RV32I-FPELIM-NEXT:    li a6, 9
+; RV32I-FPELIM-NEXT:    lui a7, 524272
+; RV32I-FPELIM-NEXT:    li t0, 8
 ; RV32I-FPELIM-NEXT:    li a0, 1
 ; RV32I-FPELIM-NEXT:    li a1, 2
 ; RV32I-FPELIM-NEXT:    li a2, 3
 ; RV32I-FPELIM-NEXT:    li a3, 4
 ; RV32I-FPELIM-NEXT:    li a4, 5
+; RV32I-FPELIM-NEXT:    sw a6, 0(sp)
+; RV32I-FPELIM-NEXT:    sw a5, 4(sp)
 ; RV32I-FPELIM-NEXT:    li a5, 6
-; RV32I-FPELIM-NEXT:    sw a7, 0(sp)
-; RV32I-FPELIM-NEXT:    sw a6, 4(sp)
-; RV32I-FPELIM-NEXT:    li a6, 7
 ; RV32I-FPELIM-NEXT:    sw zero, 16(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 20(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 24(sp)
-; RV32I-FPELIM-NEXT:    sw t0, 28(sp)
-; RV32I-FPELIM-NEXT:    addi a7, sp, 40
-; RV32I-FPELIM-NEXT:    sw t1, 40(sp)
+; RV32I-FPELIM-NEXT:    sw a7, 28(sp)
+; RV32I-FPELIM-NEXT:    li a6, 7
+; RV32I-FPELIM-NEXT:    sw t0, 40(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 44(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 48(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 52(sp)
+; RV32I-FPELIM-NEXT:    addi a7, sp, 40
 ; RV32I-FPELIM-NEXT:    call callee_large_scalars_exhausted_regs
 ; RV32I-FPELIM-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
 ; RV32I-FPELIM-NEXT:    addi sp, sp, 64
@@ -386,28 +386,28 @@ define i32 @caller_large_scalars_exhausted_regs() nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 64
-; RV32I-WITHFP-NEXT:    addi a6, s0, -48
-; RV32I-WITHFP-NEXT:    li a7, 9
-; RV32I-WITHFP-NEXT:    lui t0, 524272
-; RV32I-WITHFP-NEXT:    li t1, 8
+; RV32I-WITHFP-NEXT:    addi a5, s0, -48
+; RV32I-WITHFP-NEXT:    li a6, 9
+; RV32I-WITHFP-NEXT:    lui a7, 524272
+; RV32I-WITHFP-NEXT:    li t0, 8
 ; RV32I-WITHFP-NEXT:    li a0, 1
 ; RV32I-WITHFP-NEXT:    li a1, 2
 ; RV32I-WITHFP-NEXT:    li a2, 3
 ; RV32I-WITHFP-NEXT:    li a3, 4
 ; RV32I-WITHFP-NEXT:    li a4, 5
+; RV32I-WITHFP-NEXT:    sw a6, 0(sp)
+; RV32I-WITHFP-NEXT:    sw a5, 4(sp)
 ; RV32I-WITHFP-NEXT:    li a5, 6
-; RV32I-WITHFP-NEXT:    sw a7, 0(sp)
-; RV32I-WITHFP-NEXT:    sw a6, 4(sp)
-; RV32I-WITHFP-NEXT:    li a6, 7
 ; RV32I-WITHFP-NEXT:    sw zero, -48(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -44(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -40(s0)
-; RV32I-WITHFP-NEXT:    sw t0, -36(s0)
-; RV32I-WITHFP-NEXT:    addi a7, s0, -24
-; RV32I-WITHFP-NEXT:    sw t1, -24(s0)
+; RV32I-WITHFP-NEXT:    sw a7, -36(s0)
+; RV32I-WITHFP-NEXT:    li a6, 7
+; RV32I-WITHFP-NEXT:    sw t0, -24(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -20(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -16(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -12(s0)
+; RV32I-WITHFP-NEXT:    addi a7, s0, -24
 ; RV32I-WITHFP-NEXT:    call callee_large_scalars_exhausted_regs
 ; RV32I-WITHFP-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
@@ -614,15 +614,15 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
 ; RV32I-FPELIM-LABEL: callee_aligned_stack:
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    lw a0, 0(a2)
-; RV32I-FPELIM-NEXT:    lw a1, 8(sp)
-; RV32I-FPELIM-NEXT:    lw a2, 0(sp)
-; RV32I-FPELIM-NEXT:    lw a3, 20(sp)
+; RV32I-FPELIM-NEXT:    lw a1, 20(sp)
+; RV32I-FPELIM-NEXT:    lw a2, 8(sp)
+; RV32I-FPELIM-NEXT:    lw a3, 0(sp)
 ; RV32I-FPELIM-NEXT:    lw a4, 16(sp)
 ; RV32I-FPELIM-NEXT:    add a0, a0, a7
-; RV32I-FPELIM-NEXT:    add a1, a2, a1
+; RV32I-FPELIM-NEXT:    add a2, a3, a2
+; RV32I-FPELIM-NEXT:    add a0, a0, a2
+; RV32I-FPELIM-NEXT:    add a1, a4, a1
 ; RV32I-FPELIM-NEXT:    add a0, a0, a1
-; RV32I-FPELIM-NEXT:    add a3, a4, a3
-; RV32I-FPELIM-NEXT:    add a0, a0, a3
 ; RV32I-FPELIM-NEXT:    ret
 ;
 ; RV32I-WITHFP-LABEL: callee_aligned_stack:
@@ -632,15 +632,15 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
 ; RV32I-WITHFP-NEXT:    lw a0, 0(a2)
-; RV32I-WITHFP-NEXT:    lw a1, 8(s0)
-; RV32I-WITHFP-NEXT:    lw a2, 0(s0)
-; RV32I-WITHFP-NEXT:    lw a3, 20(s0)
+; RV32I-WITHFP-NEXT:    lw a1, 20(s0)
+; RV32I-WITHFP-NEXT:    lw a2, 8(s0)
+; RV32I-WITHFP-NEXT:    lw a3, 0(s0)
 ; RV32I-WITHFP-NEXT:    lw a4, 16(s0)
 ; RV32I-WITHFP-NEXT:    add a0, a0, a7
-; RV32I-WITHFP-NEXT:    add a1, a2, a1
+; RV32I-WITHFP-NEXT:    add a2, a3, a2
+; RV32I-WITHFP-NEXT:    add a0, a0, a2
+; RV32I-WITHFP-NEXT:    add a1, a4, a1
 ; RV32I-WITHFP-NEXT:    add a0, a0, a1
-; RV32I-WITHFP-NEXT:    add a3, a4, a3
-; RV32I-WITHFP-NEXT:    add a0, a0, a3
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    addi sp, sp, 16
@@ -664,38 +664,38 @@ define void @caller_aligned_stack() nounwind {
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -64
 ; RV32I-FPELIM-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32I-FPELIM-NEXT:    li a5, 19
-; RV32I-FPELIM-NEXT:    li a6, 18
-; RV32I-FPELIM-NEXT:    li a7, 17
-; RV32I-FPELIM-NEXT:    li t0, 16
-; RV32I-FPELIM-NEXT:    li t1, 15
-; RV32I-FPELIM-NEXT:    lui t2, 262153
-; RV32I-FPELIM-NEXT:    lui t3, 545260
-; RV32I-FPELIM-NEXT:    lui t4, 964690
-; RV32I-FPELIM-NEXT:    lui t5, 335544
-; RV32I-FPELIM-NEXT:    lui t6, 688509
+; RV32I-FPELIM-NEXT:    li a4, 19
+; RV32I-FPELIM-NEXT:    li a5, 18
+; RV32I-FPELIM-NEXT:    li a6, 17
+; RV32I-FPELIM-NEXT:    li a7, 16
+; RV32I-FPELIM-NEXT:    li t0, 15
+; RV32I-FPELIM-NEXT:    lui t1, 262153
+; RV32I-FPELIM-NEXT:    lui t2, 545260
+; RV32I-FPELIM-NEXT:    lui t3, 964690
+; RV32I-FPELIM-NEXT:    lui t4, 335544
+; RV32I-FPELIM-NEXT:    lui t5, 688509
 ; RV32I-FPELIM-NEXT:    li a0, 1
 ; RV32I-FPELIM-NEXT:    li a1, 11
 ; RV32I-FPELIM-NEXT:    addi a2, sp, 32
 ; RV32I-FPELIM-NEXT:    li a3, 12
+; RV32I-FPELIM-NEXT:    sw a5, 20(sp)
+; RV32I-FPELIM-NEXT:    sw a4, 24(sp)
 ; RV32I-FPELIM-NEXT:    li a4, 13
-; RV32I-FPELIM-NEXT:    sw a6, 20(sp)
-; RV32I-FPELIM-NEXT:    sw a5, 24(sp)
-; RV32I-FPELIM-NEXT:    li a6, 4
-; RV32I-FPELIM-NEXT:    sw t1, 0(sp)
-; RV32I-FPELIM-NEXT:    sw t0, 8(sp)
+; RV32I-FPELIM-NEXT:    sw t0, 0(sp)
+; RV32I-FPELIM-NEXT:    sw a7, 8(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 12(sp)
-; RV32I-FPELIM-NEXT:    sw a7, 16(sp)
+; RV32I-FPELIM-NEXT:    sw a6, 16(sp)
+; RV32I-FPELIM-NEXT:    li a6, 4
+; RV32I-FPELIM-NEXT:    addi a7, t1, 491
+; RV32I-FPELIM-NEXT:    addi t0, t2, -1967
+; RV32I-FPELIM-NEXT:    addi t1, t3, -328
+; RV32I-FPELIM-NEXT:    addi t2, t4, 1311
+; RV32I-FPELIM-NEXT:    addi a5, t5, -2048
+; RV32I-FPELIM-NEXT:    sw t2, 32(sp)
+; RV32I-FPELIM-NEXT:    sw t1, 36(sp)
+; RV32I-FPELIM-NEXT:    sw t0, 40(sp)
+; RV32I-FPELIM-NEXT:    sw a7, 44(sp)
 ; RV32I-FPELIM-NEXT:    li a7, 14
-; RV32I-FPELIM-NEXT:    addi t0, t2, 491
-; RV32I-FPELIM-NEXT:    addi t1, t3, -1967
-; RV32I-FPELIM-NEXT:    addi t2, t4, -328
-; RV32I-FPELIM-NEXT:    addi t3, t5, 1311
-; RV32I-FPELIM-NEXT:    addi a5, t6, -2048
-; RV32I-FPELIM-NEXT:    sw t3, 32(sp)
-; RV32I-FPELIM-NEXT:    sw t2, 36(sp)
-; RV32I-FPELIM-NEXT:    sw t1, 40(sp)
-; RV32I-FPELIM-NEXT:    sw t0, 44(sp)
 ; RV32I-FPELIM-NEXT:    call callee_aligned_stack
 ; RV32I-FPELIM-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
 ; RV32I-FPELIM-NEXT:    addi sp, sp, 64
@@ -707,38 +707,38 @@ define void @caller_aligned_stack() nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 64
-; RV32I-WITHFP-NEXT:    li a5, 19
-; RV32I-WITHFP-NEXT:    li a6, 18
-; RV32I-WITHFP-NEXT:    li a7, 17
-; RV32I-WITHFP-NEXT:    li t0, 16
-; RV32I-WITHFP-NEXT:    li t1, 15
-; RV32I-WITHFP-NEXT:    lui t2, 262153
-; RV32I-WITHFP-NEXT:    lui t3, 545260
-; RV32I-WITHFP-NEXT:    lui t4, 964690
-; RV32I-WITHFP-NEXT:    lui t5, 335544
-; RV32I-WITHFP-NEXT:    lui t6, 688509
+; RV32I-WITHFP-NEXT:    li a4, 19
+; RV32I-WITHFP-NEXT:    li a5, 18
+; RV32I-WITHFP-NEXT:    li a6, 17
+; RV32I-WITHFP-NEXT:    li a7, 16
+; RV32I-WITHFP-NEXT:    li t0, 15
+; RV32I-WITHFP-NEXT:    lui t1, 262153
+; RV32I-WITHFP-NEXT:    lui t2, 545260
+; RV32I-WITHFP-NEXT:    lui t3, 964690
+; RV32I-WITHFP-NEXT:    lui t4, 335544
+; RV32I-WITHFP-NEXT:    lui t5, 688509
 ; RV32I-WITHFP-NEXT:    li a0, 1
 ; RV32I-WITHFP-NEXT:    li a1, 11
 ; RV32I-WITHFP-NEXT:    addi a2, s0, -32
 ; RV32I-WITHFP-NEXT:    li a3, 12
+; RV32I-WITHFP-NEXT:    sw a5, 20(sp)
+; RV32I-WITHFP-NEXT:    sw a4, 24(sp)
 ; RV32I-WITHFP-NEXT:    li a4, 13
-; RV32I-WITHFP-NEXT:    sw a6, 20(sp)
-; RV32I-WITHFP-NEXT:    sw a5, 24(sp)
-; RV32I-WITHFP-NEXT:    li a6, 4
-; RV32I-WITHFP-NEXT:    sw t1, 0(sp)
-; RV32I-WITHFP-NEXT:    sw t0, 8(sp)
+; RV32I-WITHFP-NEXT:    sw t0, 0(sp)
+; RV32I-WITHFP-NEXT:    sw a7, 8(sp)
 ; RV32I-WITHFP-NEXT:    sw zero, 12(sp)
-; RV32I-WITHFP-NEXT:    sw a7, 16(sp)
+; RV32I-WITHFP-NEXT:    sw a6, 16(sp)
+; RV32I-WITHFP-NEXT:    li a6, 4
+; RV32I-WITHFP-NEXT:    addi a7, t1, 491
+; RV32I-WITHFP-NEXT:    addi t0, t2, -1967
+; RV32I-WITHFP-NEXT:    addi t1, t3, -328
+; RV32I-WITHFP-NEXT:    addi t2, t4, 1311
+; RV32I-WITHFP-NEXT:    addi a5, t5, -2048
+; RV32I-WITHFP-NEXT:    sw t2, -32(s0)
+; RV32I-WITHFP-NEXT:    sw t1, -28(s0)
+; RV32I-WITHFP-NEXT:    sw t0, -24(s0)
+; RV32I-WITHFP-NEXT:    sw a7, -20(s0)
 ; RV32I-WITHFP-NEXT:    li a7, 14
-; RV32I-WITHFP-NEXT:    addi t0, t2, 491
-; RV32I-WITHFP-NEXT:    addi t1, t3, -1967
-; RV32I-WITHFP-NEXT:    addi t2, t4, -328
-; RV32I-WITHFP-NEXT:    addi t3, t5, 1311
-; RV32I-WITHFP-NEXT:    addi a5, t6, -2048
-; RV32I-WITHFP-NEXT:    sw t3, -32(s0)
-; RV32I-WITHFP-NEXT:    sw t2, -28(s0)
-; RV32I-WITHFP-NEXT:    sw t1, -24(s0)
-; RV32I-WITHFP-NEXT:    sw t0, -20(s0)
 ; RV32I-WITHFP-NEXT:    call callee_aligned_stack
 ; RV32I-WITHFP-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32.ll
index 1dac139503ba7..5e37c83d30ba8 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32.ll
@@ -111,8 +111,8 @@ define i32 @caller_float_on_stack() nounwind {
 ; RV32I-FPELIM-NEXT:    li a0, 1
 ; RV32I-FPELIM-NEXT:    li a2, 2
 ; RV32I-FPELIM-NEXT:    li a4, 3
-; RV32I-FPELIM-NEXT:    li a6, 4
 ; RV32I-FPELIM-NEXT:    sw a1, 0(sp)
+; RV32I-FPELIM-NEXT:    li a6, 4
 ; RV32I-FPELIM-NEXT:    li a1, 0
 ; RV32I-FPELIM-NEXT:    li a3, 0
 ; RV32I-FPELIM-NEXT:    li a5, 0
@@ -132,8 +132,8 @@ define i32 @caller_float_on_stack() nounwind {
 ; RV32I-WITHFP-NEXT:    li a0, 1
 ; RV32I-WITHFP-NEXT:    li a2, 2
 ; RV32I-WITHFP-NEXT:    li a4, 3
-; RV32I-WITHFP-NEXT:    li a6, 4
 ; RV32I-WITHFP-NEXT:    sw a1, 0(sp)
+; RV32I-WITHFP-NEXT:    li a6, 4
 ; RV32I-WITHFP-NEXT:    li a1, 0
 ; RV32I-WITHFP-NEXT:    li a3, 0
 ; RV32I-WITHFP-NEXT:    li a5, 0
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll
index 7630d5b8f77ef..3ae76de6a65f7 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll
@@ -51,14 +51,14 @@ define i32 @caller_double_in_fpr_exhausted_gprs() nounwind {
 ; RV32-ILP32D:       # %bb.0:
 ; RV32-ILP32D-NEXT:    addi sp, sp, -16
 ; RV32-ILP32D-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-ILP32D-NEXT:    li a1, 5
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV32-ILP32D-NEXT:    fld fa0, %lo(.LCPI3_0)(a0)
+; RV32-ILP32D-NEXT:    li a0, 5
+; RV32-ILP32D-NEXT:    lui a1, %hi(.LCPI3_0)
+; RV32-ILP32D-NEXT:    sw a0, 0(sp)
+; RV32-ILP32D-NEXT:    fld fa0, %lo(.LCPI3_0)(a1)
 ; RV32-ILP32D-NEXT:    li a0, 1
 ; RV32-ILP32D-NEXT:    li a2, 2
 ; RV32-ILP32D-NEXT:    li a4, 3
 ; RV32-ILP32D-NEXT:    li a6, 4
-; RV32-ILP32D-NEXT:    sw a1, 0(sp)
 ; RV32-ILP32D-NEXT:    li a1, 0
 ; RV32-ILP32D-NEXT:    li a3, 0
 ; RV32-ILP32D-NEXT:    li a5, 0
@@ -147,16 +147,17 @@ define i32 @caller_double_in_gpr_and_stack_almost_exhausted_gprs_fprs() nounwind
 ; RV32-ILP32D:       # %bb.0:
 ; RV32-ILP32D-NEXT:    addi sp, sp, -16
 ; RV32-ILP32D-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-ILP32D-NEXT:    lui a1, 262816
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_0)
+; RV32-ILP32D-NEXT:    lui a0, 262816
+; RV32-ILP32D-NEXT:    lui a1, %hi(.LCPI7_0)
 ; RV32-ILP32D-NEXT:    lui a2, %hi(.LCPI7_1)
 ; RV32-ILP32D-NEXT:    lui a3, %hi(.LCPI7_2)
 ; RV32-ILP32D-NEXT:    lui a4, %hi(.LCPI7_3)
 ; RV32-ILP32D-NEXT:    lui a5, %hi(.LCPI7_4)
 ; RV32-ILP32D-NEXT:    lui a6, %hi(.LCPI7_5)
 ; RV32-ILP32D-NEXT:    lui a7, %hi(.LCPI7_6)
-; RV32-ILP32D-NEXT:    fld fa0, %lo(.LCPI7_0)(a0)
+; RV32-ILP32D-NEXT:    sw a0, 0(sp)
 ; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_7)
+; RV32-ILP32D-NEXT:    fld fa0, %lo(.LCPI7_0)(a1)
 ; RV32-ILP32D-NEXT:    fld fa1, %lo(.LCPI7_1)(a2)
 ; RV32-ILP32D-NEXT:    fld fa2, %lo(.LCPI7_2)(a3)
 ; RV32-ILP32D-NEXT:    fld fa3, %lo(.LCPI7_3)(a4)
@@ -168,7 +169,6 @@ define i32 @caller_double_in_gpr_and_stack_almost_exhausted_gprs_fprs() nounwind
 ; RV32-ILP32D-NEXT:    li a2, 3
 ; RV32-ILP32D-NEXT:    li a4, 5
 ; RV32-ILP32D-NEXT:    li a6, 7
-; RV32-ILP32D-NEXT:    sw a1, 0(sp)
 ; RV32-ILP32D-NEXT:    li a1, 0
 ; RV32-ILP32D-NEXT:    li a3, 0
 ; RV32-ILP32D-NEXT:    li a5, 0
@@ -203,29 +203,29 @@ define i32 @caller_double_on_stack_exhausted_gprs_fprs() nounwind {
 ; RV32-ILP32D:       # %bb.0:
 ; RV32-ILP32D-NEXT:    addi sp, sp, -16
 ; RV32-ILP32D-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-ILP32D-NEXT:    lui a1, 262816
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_0)
+; RV32-ILP32D-NEXT:    lui a0, 262816
+; RV32-ILP32D-NEXT:    lui a1, %hi(.LCPI9_0)
 ; RV32-ILP32D-NEXT:    lui a2, %hi(.LCPI9_1)
 ; RV32-ILP32D-NEXT:    lui a3, %hi(.LCPI9_2)
 ; RV32-ILP32D-NEXT:    lui a4, %hi(.LCPI9_3)
 ; RV32-ILP32D-NEXT:    lui a5, %hi(.LCPI9_4)
 ; RV32-ILP32D-NEXT:    lui a6, %hi(.LCPI9_5)
 ; RV32-ILP32D-NEXT:    lui a7, %hi(.LCPI9_6)
-; RV32-ILP32D-NEXT:    fld fa0, %lo(.LCPI9_0)(a0)
-; RV32-ILP32D-NEXT:    lui t0, %hi(.LCPI9_7)
+; RV32-ILP32D-NEXT:    sw zero, 0(sp)
+; RV32-ILP32D-NEXT:    sw a0, 4(sp)
+; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_7)
+; RV32-ILP32D-NEXT:    fld fa0, %lo(.LCPI9_0)(a1)
 ; RV32-ILP32D-NEXT:    fld fa1, %lo(.LCPI9_1)(a2)
-; RV32-ILP32D-NEXT:    li a0, 1
 ; RV32-ILP32D-NEXT:    fld fa2, %lo(.LCPI9_2)(a3)
 ; RV32-ILP32D-NEXT:    fld fa3, %lo(.LCPI9_3)(a4)
 ; RV32-ILP32D-NEXT:    fld fa4, %lo(.LCPI9_4)(a5)
 ; RV32-ILP32D-NEXT:    fld fa5, %lo(.LCPI9_5)(a6)
 ; RV32-ILP32D-NEXT:    fld fa6, %lo(.LCPI9_6)(a7)
-; RV32-ILP32D-NEXT:    fld fa7, %lo(.LCPI9_7)(t0)
+; RV32-ILP32D-NEXT:    fld fa7, %lo(.LCPI9_7)(a0)
+; RV32-ILP32D-NEXT:    li a0, 1
 ; RV32-ILP32D-NEXT:    li a2, 3
 ; RV32-ILP32D-NEXT:    li a4, 5
 ; RV32-ILP32D-NEXT:    li a6, 7
-; RV32-ILP32D-NEXT:    sw zero, 0(sp)
-; RV32-ILP32D-NEXT:    sw a1, 4(sp)
 ; RV32-ILP32D-NEXT:    li a1, 0
 ; RV32-ILP32D-NEXT:    li a3, 0
 ; RV32-ILP32D-NEXT:    li a5, 0
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll
index e16bed5400300..51def89ed6c3a 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll
@@ -224,10 +224,10 @@ define i32 @caller_float_on_stack() {
 ; ILP32E-FPELIM-NEXT:    li a3, 4
 ; ILP32E-FPELIM-NEXT:    li a0, 1
 ; ILP32E-FPELIM-NEXT:    li a2, 2
-; ILP32E-FPELIM-NEXT:    li a4, 3
 ; ILP32E-FPELIM-NEXT:    sw a3, 0(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 4(sp)
 ; ILP32E-FPELIM-NEXT:    sw a1, 8(sp)
+; ILP32E-FPELIM-NEXT:    li a4, 3
 ; ILP32E-FPELIM-NEXT:    li a1, 0
 ; ILP32E-FPELIM-NEXT:    li a3, 0
 ; ILP32E-FPELIM-NEXT:    li a5, 0
@@ -252,10 +252,10 @@ define i32 @caller_float_on_stack() {
 ; ILP32E-WITHFP-NEXT:    li a3, 4
 ; ILP32E-WITHFP-NEXT:    li a0, 1
 ; ILP32E-WITHFP-NEXT:    li a2, 2
-; ILP32E-WITHFP-NEXT:    li a4, 3
 ; ILP32E-WITHFP-NEXT:    sw a3, 0(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 4(sp)
 ; ILP32E-WITHFP-NEXT:    sw a1, 8(sp)
+; ILP32E-WITHFP-NEXT:    li a4, 3
 ; ILP32E-WITHFP-NEXT:    li a1, 0
 ; ILP32E-WITHFP-NEXT:    li a3, 0
 ; ILP32E-WITHFP-NEXT:    li a5, 0
@@ -280,10 +280,10 @@ define i32 @caller_float_on_stack() {
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 4
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a0, 1
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a2, 2
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a4, 3
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a3, 0(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 4(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a1, 8(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a4, 3
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a1, 0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a5, 0
@@ -306,10 +306,10 @@ define i32 @caller_float_on_stack() {
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 4
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a0, 1
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a2, 2
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a4, 3
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a3, 0(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 4(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a1, 8(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a4, 3
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a1, 0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a5, 0
@@ -589,16 +589,16 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
 ; ILP32E-FPELIM-LABEL: callee_aligned_stack:
 ; ILP32E-FPELIM:       # %bb.0:
 ; ILP32E-FPELIM-NEXT:    lw a0, 0(a2)
-; ILP32E-FPELIM-NEXT:    lw a1, 12(sp)
-; ILP32E-FPELIM-NEXT:    lw a2, 4(sp)
+; ILP32E-FPELIM-NEXT:    lw a1, 24(sp)
+; ILP32E-FPELIM-NEXT:    lw a2, 12(sp)
 ; ILP32E-FPELIM-NEXT:    lw a3, 8(sp)
-; ILP32E-FPELIM-NEXT:    lw a4, 24(sp)
+; ILP32E-FPELIM-NEXT:    lw a4, 4(sp)
 ; ILP32E-FPELIM-NEXT:    lw a5, 20(sp)
+; ILP32E-FPELIM-NEXT:    add a0, a0, a4
+; ILP32E-FPELIM-NEXT:    add a2, a3, a2
 ; ILP32E-FPELIM-NEXT:    add a0, a0, a2
-; ILP32E-FPELIM-NEXT:    add a1, a3, a1
+; ILP32E-FPELIM-NEXT:    add a1, a5, a1
 ; ILP32E-FPELIM-NEXT:    add a0, a0, a1
-; ILP32E-FPELIM-NEXT:    add a4, a5, a4
-; ILP32E-FPELIM-NEXT:    add a0, a0, a4
 ; ILP32E-FPELIM-NEXT:    ret
 ;
 ; ILP32E-WITHFP-LABEL: callee_aligned_stack:
@@ -612,16 +612,16 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 8
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-NEXT:    lw a0, 0(a2)
-; ILP32E-WITHFP-NEXT:    lw a1, 12(s0)
-; ILP32E-WITHFP-NEXT:    lw a2, 4(s0)
+; ILP32E-WITHFP-NEXT:    lw a1, 24(s0)
+; ILP32E-WITHFP-NEXT:    lw a2, 12(s0)
 ; ILP32E-WITHFP-NEXT:    lw a3, 8(s0)
-; ILP32E-WITHFP-NEXT:    lw a4, 24(s0)
+; ILP32E-WITHFP-NEXT:    lw a4, 4(s0)
 ; ILP32E-WITHFP-NEXT:    lw a5, 20(s0)
+; ILP32E-WITHFP-NEXT:    add a0, a0, a4
+; ILP32E-WITHFP-NEXT:    add a2, a3, a2
 ; ILP32E-WITHFP-NEXT:    add a0, a0, a2
-; ILP32E-WITHFP-NEXT:    add a1, a3, a1
+; ILP32E-WITHFP-NEXT:    add a1, a5, a1
 ; ILP32E-WITHFP-NEXT:    add a0, a0, a1
-; ILP32E-WITHFP-NEXT:    add a4, a5, a4
-; ILP32E-WITHFP-NEXT:    add a0, a0, a4
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa sp, 8
 ; ILP32E-WITHFP-NEXT:    lw ra, 4(sp) # 4-byte Folded Reload
 ; ILP32E-WITHFP-NEXT:    lw s0, 0(sp) # 4-byte Folded Reload
@@ -634,16 +634,16 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
 ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_aligned_stack:
 ; ILP32E-FPELIM-SAVE-RESTORE:       # %bb.0:
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a0, 0(a2)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 12(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a2, 4(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 24(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a2, 12(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a3, 8(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a4, 24(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a4, 4(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a5, 20(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a4
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a2, a3, a2
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a2
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a1, a3, a1
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a1, a5, a1
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a1
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a4, a5, a4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a4
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    ret
 ;
 ; ILP32E-WITHFP-SAVE-RESTORE-LABEL: callee_aligned_stack:
@@ -655,16 +655,16 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi s0, sp, 8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a0, 0(a2)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 12(s0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a2, 4(s0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 24(s0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a2, 12(s0)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a3, 8(s0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a4, 24(s0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a4, 4(s0)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a5, 20(s0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a4
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a2, a3, a2
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a2
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a1, a3, a1
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a1, a5, a1
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a1
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a4, a5, a4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a4
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa sp, 8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    tail __riscv_restore_1
   %1 = bitcast fp128 %c to i128
@@ -694,43 +694,43 @@ define void @caller_aligned_stack() {
 ; ILP32E-FPELIM-NEXT:    addi s0, sp, 64
 ; ILP32E-FPELIM-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-NEXT:    li a3, 18
-; ILP32E-FPELIM-NEXT:    li a4, 17
-; ILP32E-FPELIM-NEXT:    li a5, 16
-; ILP32E-FPELIM-NEXT:    lui a6, 262236
-; ILP32E-FPELIM-NEXT:    lui a7, 377487
-; ILP32E-FPELIM-NEXT:    li t0, 15
-; ILP32E-FPELIM-NEXT:    li t1, 14
-; ILP32E-FPELIM-NEXT:    li t2, 4
-; ILP32E-FPELIM-NEXT:    lui t3, 262153
-; ILP32E-FPELIM-NEXT:    lui t4, 545260
-; ILP32E-FPELIM-NEXT:    lui t5, 964690
-; ILP32E-FPELIM-NEXT:    lui t6, 335544
-; ILP32E-FPELIM-NEXT:    lui s2, 688509
+; ILP32E-FPELIM-NEXT:    li a2, 18
+; ILP32E-FPELIM-NEXT:    li a3, 17
+; ILP32E-FPELIM-NEXT:    li a4, 16
+; ILP32E-FPELIM-NEXT:    lui a5, 262236
+; ILP32E-FPELIM-NEXT:    lui a6, 377487
+; ILP32E-FPELIM-NEXT:    li a7, 15
+; ILP32E-FPELIM-NEXT:    li t0, 14
+; ILP32E-FPELIM-NEXT:    li t1, 4
+; ILP32E-FPELIM-NEXT:    lui t2, 262153
+; ILP32E-FPELIM-NEXT:    lui t3, 545260
+; ILP32E-FPELIM-NEXT:    lui t4, 964690
+; ILP32E-FPELIM-NEXT:    lui t5, 335544
+; ILP32E-FPELIM-NEXT:    lui t6, 688509
 ; ILP32E-FPELIM-NEXT:    li a0, 1
 ; ILP32E-FPELIM-NEXT:    li a1, 11
+; ILP32E-FPELIM-NEXT:    addi a5, a5, 655
+; ILP32E-FPELIM-NEXT:    sw a5, 16(sp)
+; ILP32E-FPELIM-NEXT:    sw a4, 20(sp)
+; ILP32E-FPELIM-NEXT:    sw a3, 24(sp)
+; ILP32E-FPELIM-NEXT:    sw a2, 28(sp)
 ; ILP32E-FPELIM-NEXT:    addi a2, sp, 32
-; ILP32E-FPELIM-NEXT:    addi a6, a6, 655
-; ILP32E-FPELIM-NEXT:    sw a6, 16(sp)
-; ILP32E-FPELIM-NEXT:    sw a5, 20(sp)
-; ILP32E-FPELIM-NEXT:    sw a4, 24(sp)
-; ILP32E-FPELIM-NEXT:    sw a3, 28(sp)
+; ILP32E-FPELIM-NEXT:    addi a3, a6, 1475
+; ILP32E-FPELIM-NEXT:    sw t1, 0(sp)
+; ILP32E-FPELIM-NEXT:    sw t0, 4(sp)
+; ILP32E-FPELIM-NEXT:    sw a7, 8(sp)
+; ILP32E-FPELIM-NEXT:    sw a3, 12(sp)
 ; ILP32E-FPELIM-NEXT:    li a3, 12
-; ILP32E-FPELIM-NEXT:    addi a4, a7, 1475
-; ILP32E-FPELIM-NEXT:    sw t2, 0(sp)
-; ILP32E-FPELIM-NEXT:    sw t1, 4(sp)
-; ILP32E-FPELIM-NEXT:    sw t0, 8(sp)
-; ILP32E-FPELIM-NEXT:    sw a4, 12(sp)
+; ILP32E-FPELIM-NEXT:    addi a4, t2, 491
+; ILP32E-FPELIM-NEXT:    addi a6, t3, -1967
+; ILP32E-FPELIM-NEXT:    addi a7, t4, -328
+; ILP32E-FPELIM-NEXT:    addi t0, t5, 1311
+; ILP32E-FPELIM-NEXT:    addi a5, t6, -2048
+; ILP32E-FPELIM-NEXT:    sw t0, 32(sp)
+; ILP32E-FPELIM-NEXT:    sw a7, 36(sp)
+; ILP32E-FPELIM-NEXT:    sw a6, 40(sp)
+; ILP32E-FPELIM-NEXT:    sw a4, 44(sp)
 ; ILP32E-FPELIM-NEXT:    li a4, 13
-; ILP32E-FPELIM-NEXT:    addi a6, t3, 491
-; ILP32E-FPELIM-NEXT:    addi a7, t4, -1967
-; ILP32E-FPELIM-NEXT:    addi t0, t5, -328
-; ILP32E-FPELIM-NEXT:    addi t1, t6, 1311
-; ILP32E-FPELIM-NEXT:    addi a5, s2, -2048
-; ILP32E-FPELIM-NEXT:    sw t1, 32(sp)
-; ILP32E-FPELIM-NEXT:    sw t0, 36(sp)
-; ILP32E-FPELIM-NEXT:    sw a7, 40(sp)
-; ILP32E-FPELIM-NEXT:    sw a6, 44(sp)
 ; ILP32E-FPELIM-NEXT:    call callee_aligned_stack
 ; ILP32E-FPELIM-NEXT:    addi sp, s0, -64
 ; ILP32E-FPELIM-NEXT:    .cfi_def_cfa sp, 64
@@ -753,43 +753,43 @@ define void @caller_aligned_stack() {
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 64
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-NEXT:    li a3, 18
-; ILP32E-WITHFP-NEXT:    li a4, 17
-; ILP32E-WITHFP-NEXT:    li a5, 16
-; ILP32E-WITHFP-NEXT:    lui a6, 262236
-; ILP32E-WITHFP-NEXT:    lui a7, 377487
-; ILP32E-WITHFP-NEXT:    li t0, 15
-; ILP32E-WITHFP-NEXT:    li t1, 14
-; ILP32E-WITHFP-NEXT:    li t2, 4
-; ILP32E-WITHFP-NEXT:    lui t3, 262153
-; ILP32E-WITHFP-NEXT:    lui t4, 545260
-; ILP32E-WITHFP-NEXT:    lui t5, 964690
-; ILP32E-WITHFP-NEXT:    lui t6, 335544
-; ILP32E-WITHFP-NEXT:    lui s2, 688509
+; ILP32E-WITHFP-NEXT:    li a2, 18
+; ILP32E-WITHFP-NEXT:    li a3, 17
+; ILP32E-WITHFP-NEXT:    li a4, 16
+; ILP32E-WITHFP-NEXT:    lui a5, 262236
+; ILP32E-WITHFP-NEXT:    lui a6, 377487
+; ILP32E-WITHFP-NEXT:    li a7, 15
+; ILP32E-WITHFP-NEXT:    li t0, 14
+; ILP32E-WITHFP-NEXT:    li t1, 4
+; ILP32E-WITHFP-NEXT:    lui t2, 262153
+; ILP32E-WITHFP-NEXT:    lui t3, 545260
+; ILP32E-WITHFP-NEXT:    lui t4, 964690
+; ILP32E-WITHFP-NEXT:    lui t5, 335544
+; ILP32E-WITHFP-NEXT:    lui t6, 688509
 ; ILP32E-WITHFP-NEXT:    li a0, 1
 ; ILP32E-WITHFP-NEXT:    li a1, 11
+; ILP32E-WITHFP-NEXT:    addi a5, a5, 655
+; ILP32E-WITHFP-NEXT:    sw a5, 16(sp)
+; ILP32E-WITHFP-NEXT:    sw a4, 20(sp)
+; ILP32E-WITHFP-NEXT:    sw a3, 24(sp)
+; ILP32E-WITHFP-NEXT:    sw a2, 28(sp)
 ; ILP32E-WITHFP-NEXT:    addi a2, sp, 32
-; ILP32E-WITHFP-NEXT:    addi a6, a6, 655
-; ILP32E-WITHFP-NEXT:    sw a6, 16(sp)
-; ILP32E-WITHFP-NEXT:    sw a5, 20(sp)
-; ILP32E-WITHFP-NEXT:    sw a4, 24(sp)
-; ILP32E-WITHFP-NEXT:    sw a3, 28(sp)
+; ILP32E-WITHFP-NEXT:    addi a3, a6, 1475
+; ILP32E-WITHFP-NEXT:    sw t1, 0(sp)
+; ILP32E-WITHFP-NEXT:    sw t0, 4(sp)
+; ILP32E-WITHFP-NEXT:    sw a7, 8(sp)
+; ILP32E-WITHFP-NEXT:    sw a3, 12(sp)
 ; ILP32E-WITHFP-NEXT:    li a3, 12
-; ILP32E-WITHFP-NEXT:    addi a4, a7, 1475
-; ILP32E-WITHFP-NEXT:    sw t2, 0(sp)
-; ILP32E-WITHFP-NEXT:    sw t1, 4(sp)
-; ILP32E-WITHFP-NEXT:    sw t0, 8(sp)
-; ILP32E-WITHFP-NEXT:    sw a4, 12(sp)
+; ILP32E-WITHFP-NEXT:    addi a4, t2, 491
+; ILP32E-WITHFP-NEXT:    addi a6, t3, -1967
+; ILP32E-WITHFP-NEXT:    addi a7, t4, -328
+; ILP32E-WITHFP-NEXT:    addi t0, t5, 1311
+; ILP32E-WITHFP-NEXT:    addi a5, t6, -2048
+; ILP32E-WITHFP-NEXT:    sw t0, 32(sp)
+; ILP32E-WITHFP-NEXT:    sw a7, 36(sp)
+; ILP32E-WITHFP-NEXT:    sw a6, 40(sp)
+; ILP32E-WITHFP-NEXT:    sw a4, 44(sp)
 ; ILP32E-WITHFP-NEXT:    li a4, 13
-; ILP32E-WITHFP-NEXT:    addi a6, t3, 491
-; ILP32E-WITHFP-NEXT:    addi a7, t4, -1967
-; ILP32E-WITHFP-NEXT:    addi t0, t5, -328
-; ILP32E-WITHFP-NEXT:    addi t1, t6, 1311
-; ILP32E-WITHFP-NEXT:    addi a5, s2, -2048
-; ILP32E-WITHFP-NEXT:    sw t1, 32(sp)
-; ILP32E-WITHFP-NEXT:    sw t0, 36(sp)
-; ILP32E-WITHFP-NEXT:    sw a7, 40(sp)
-; ILP32E-WITHFP-NEXT:    sw a6, 44(sp)
 ; ILP32E-WITHFP-NEXT:    call callee_aligned_stack
 ; ILP32E-WITHFP-NEXT:    addi sp, s0, -64
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa sp, 64
@@ -812,43 +812,43 @@ define void @caller_aligned_stack() {
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi s0, sp, 64
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 18
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a4, 17
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a5, 16
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a6, 262236
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a7, 377487
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li t0, 15
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li t1, 14
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li t2, 4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t3, 262153
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t4, 545260
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t5, 964690
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t6, 335544
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui s2, 688509
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a2, 18
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 17
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a4, 16
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a5, 262236
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a6, 377487
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a7, 15
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li t0, 14
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li t1, 4
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t2, 262153
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t3, 545260
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t4, 964690
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t5, 335544
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t6, 688509
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a0, 1
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a1, 11
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a5, a5, 655
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a5, 16(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a4, 20(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a3, 24(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a2, 28(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a2, sp, 32
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a6, a6, 655
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a6, 16(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a5, 20(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a4, 24(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a3, 28(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a3, a6, 1475
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t1, 0(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t0, 4(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a7, 8(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a3, 12(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 12
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a4, a7, 1475
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t2, 0(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t1, 4(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t0, 8(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a4, 12(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a4, t2, 491
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a6, t3, -1967
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a7, t4, -328
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi t0, t5, 1311
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a5, t6, -2048
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t0, 32(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a7, 36(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a6, 40(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a4, 44(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a4, 13
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a6, t3, 491
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a7, t4, -1967
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi t0, t5, -328
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi t1, t6, 1311
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a5, s2, -2048
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t1, 32(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t0, 36(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a7, 40(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a6, 44(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    call callee_aligned_stack
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi sp, s0, -64
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    .cfi_def_cfa sp, 64
@@ -867,43 +867,43 @@ define void @caller_aligned_stack() {
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi s0, sp, 64
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 18
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a4, 17
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a5, 16
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a6, 262236
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a7, 377487
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li t0, 15
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li t1, 14
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li t2, 4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t3, 262153
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t4, 545260
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t5, 964690
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t6, 335544
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui s2, 688509
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a2, 18
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 17
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a4, 16
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a5, 262236
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a6, 377487
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a7, 15
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li t0, 14
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li t1, 4
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t2, 262153
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t3, 545260
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t4, 964690
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t5, 335544
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t6, 688509
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a0, 1
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a1, 11
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a5, a5, 655
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a5, 16(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a4, 20(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a3, 24(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a2, 28(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a2, sp, 32
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a6, a6, 655
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a6, 16(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a5, 20(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a4, 24(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a3, 28(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a3, a6, 1475
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t1, 0(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t0, 4(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a7, 8(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a3, 12(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 12
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a4, a7, 1475
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t2, 0(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t1, 4(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t0, 8(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a4, 12(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a4, t2, 491
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a6, t3, -1967
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a7, t4, -328
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi t0, t5, 1311
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a5, t6, -2048
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t0, 32(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a7, 36(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a6, 40(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a4, 44(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a4, 13
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a6, t3, 491
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a7, t4, -1967
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi t0, t5, -328
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi t1, t6, 1311
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a5, s2, -2048
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t1, 32(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t0, 36(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a7, 40(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a6, 44(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    call callee_aligned_stack
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi sp, s0, -64
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa sp, 64
@@ -1272,17 +1272,17 @@ define i32 @caller_many_scalars() {
 ; ILP32E-FPELIM-NEXT:    sw ra, 16(sp) # 4-byte Folded Spill
 ; ILP32E-FPELIM-NEXT:    .cfi_offset ra, -4
 ; ILP32E-FPELIM-NEXT:    li a4, 8
-; ILP32E-FPELIM-NEXT:    li a6, 7
-; ILP32E-FPELIM-NEXT:    li a7, 6
+; ILP32E-FPELIM-NEXT:    li a5, 7
+; ILP32E-FPELIM-NEXT:    li a6, 6
 ; ILP32E-FPELIM-NEXT:    li a0, 1
 ; ILP32E-FPELIM-NEXT:    li a1, 2
 ; ILP32E-FPELIM-NEXT:    li a2, 3
 ; ILP32E-FPELIM-NEXT:    li a3, 4
-; ILP32E-FPELIM-NEXT:    li a5, 5
-; ILP32E-FPELIM-NEXT:    sw a7, 0(sp)
-; ILP32E-FPELIM-NEXT:    sw a6, 4(sp)
+; ILP32E-FPELIM-NEXT:    sw a6, 0(sp)
+; ILP32E-FPELIM-NEXT:    sw a5, 4(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 8(sp)
 ; ILP32E-FPELIM-NEXT:    sw a4, 12(sp)
+; ILP32E-FPELIM-NEXT:    li a5, 5
 ; ILP32E-FPELIM-NEXT:    li a4, 0
 ; ILP32E-FPELIM-NEXT:    call callee_many_scalars
 ; ILP32E-FPELIM-NEXT:    lw ra, 16(sp) # 4-byte Folded Reload
@@ -1302,17 +1302,17 @@ define i32 @caller_many_scalars() {
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 24
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-NEXT:    li a4, 8
-; ILP32E-WITHFP-NEXT:    li a6, 7
-; ILP32E-WITHFP-NEXT:    li a7, 6
+; ILP32E-WITHFP-NEXT:    li a5, 7
+; ILP32E-WITHFP-NEXT:    li a6, 6
 ; ILP32E-WITHFP-NEXT:    li a0, 1
 ; ILP32E-WITHFP-NEXT:    li a1, 2
 ; ILP32E-WITHFP-NEXT:    li a2, 3
 ; ILP32E-WITHFP-NEXT:    li a3, 4
-; ILP32E-WITHFP-NEXT:    li a5, 5
-; ILP32E-WITHFP-NEXT:    sw a7, 0(sp)
-; ILP32E-WITHFP-NEXT:    sw a6, 4(sp)
+; ILP32E-WITHFP-NEXT:    sw a6, 0(sp)
+; ILP32E-WITHFP-NEXT:    sw a5, 4(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 8(sp)
 ; ILP32E-WITHFP-NEXT:    sw a4, 12(sp)
+; ILP32E-WITHFP-NEXT:    li a5, 5
 ; ILP32E-WITHFP-NEXT:    li a4, 0
 ; ILP32E-WITHFP-NEXT:    call callee_many_scalars
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa sp, 24
@@ -1332,17 +1332,17 @@ define i32 @caller_many_scalars() {
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi sp, sp, -16
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    .cfi_def_cfa_offset 20
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a4, 8
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a6, 7
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a7, 6
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a5, 7
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a6, 6
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a0, 1
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a1, 2
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a2, 3
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a5, 5
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a7, 0(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a6, 4(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a6, 0(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a5, 4(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 8(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a4, 12(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a5, 5
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a4, 0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    call callee_many_scalars
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi sp, sp, 16
@@ -1360,17 +1360,17 @@ define i32 @caller_many_scalars() {
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi s0, sp, 24
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a4, 8
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a6, 7
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a7, 6
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a5, 7
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a6, 6
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a0, 1
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a1, 2
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a2, 3
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a5, 5
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a7, 0(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a6, 4(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a6, 0(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a5, 4(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 8(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a4, 12(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a5, 5
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a4, 0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    call callee_many_scalars
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa sp, 24
@@ -1390,17 +1390,17 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) {
 ; ILP32E-FPELIM-NEXT:    lw a3, 4(a1)
 ; ILP32E-FPELIM-NEXT:    lw a4, 8(a1)
 ; ILP32E-FPELIM-NEXT:    lw a1, 12(a1)
-; ILP32E-FPELIM-NEXT:    lw a5, 12(a0)
+; ILP32E-FPELIM-NEXT:    lw a5, 0(a0)
 ; ILP32E-FPELIM-NEXT:    lw a6, 4(a0)
 ; ILP32E-FPELIM-NEXT:    lw a7, 8(a0)
-; ILP32E-FPELIM-NEXT:    lw a0, 0(a0)
-; ILP32E-FPELIM-NEXT:    xor a1, a5, a1
-; ILP32E-FPELIM-NEXT:    xor a3, a6, a3
-; ILP32E-FPELIM-NEXT:    xor a4, a7, a4
-; ILP32E-FPELIM-NEXT:    xor a0, a0, a2
-; ILP32E-FPELIM-NEXT:    or a1, a3, a1
-; ILP32E-FPELIM-NEXT:    or a0, a0, a4
-; ILP32E-FPELIM-NEXT:    or a0, a0, a1
+; ILP32E-FPELIM-NEXT:    lw a0, 12(a0)
+; ILP32E-FPELIM-NEXT:    xor a0, a0, a1
+; ILP32E-FPELIM-NEXT:    xor a1, a6, a3
+; ILP32E-FPELIM-NEXT:    xor a3, a7, a4
+; ILP32E-FPELIM-NEXT:    xor a2, a5, a2
+; ILP32E-FPELIM-NEXT:    or a0, a1, a0
+; ILP32E-FPELIM-NEXT:    or a2, a2, a3
+; ILP32E-FPELIM-NEXT:    or a0, a2, a0
 ; ILP32E-FPELIM-NEXT:    seqz a0, a0
 ; ILP32E-FPELIM-NEXT:    ret
 ;
@@ -1418,17 +1418,17 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) {
 ; ILP32E-WITHFP-NEXT:    lw a3, 4(a1)
 ; ILP32E-WITHFP-NEXT:    lw a4, 8(a1)
 ; ILP32E-WITHFP-NEXT:    lw a1, 12(a1)
-; ILP32E-WITHFP-NEXT:    lw a5, 12(a0)
+; ILP32E-WITHFP-NEXT:    lw a5, 0(a0)
 ; ILP32E-WITHFP-NEXT:    lw a6, 4(a0)
 ; ILP32E-WITHFP-NEXT:    lw a7, 8(a0)
-; ILP32E-WITHFP-NEXT:    lw a0, 0(a0)
-; ILP32E-WITHFP-NEXT:    xor a1, a5, a1
-; ILP32E-WITHFP-NEXT:    xor a3, a6, a3
-; ILP32E-WITHFP-NEXT:    xor a4, a7, a4
-; ILP32E-WITHFP-NEXT:    xor a0, a0, a2
-; ILP32E-WITHFP-NEXT:    or a1, a3, a1
-; ILP32E-WITHFP-NEXT:    or a0, a0, a4
-; ILP32E-WITHFP-NEXT:    or a0, a0, a1
+; ILP32E-WITHFP-NEXT:    lw a0, 12(a0)
+; ILP32E-WITHFP-NEXT:    xor a0, a0, a1
+; ILP32E-WITHFP-NEXT:    xor a1, a6, a3
+; ILP32E-WITHFP-NEXT:    xor a3, a7, a4
+; ILP32E-WITHFP-NEXT:    xor a2, a5, a2
+; ILP32E-WITHFP-NEXT:    or a0, a1, a0
+; ILP32E-WITHFP-NEXT:    or a2, a2, a3
+; ILP32E-WITHFP-NEXT:    or a0, a2, a0
 ; ILP32E-WITHFP-NEXT:    seqz a0, a0
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa sp, 8
 ; ILP32E-WITHFP-NEXT:    lw ra, 4(sp) # 4-byte Folded Reload
@@ -1445,17 +1445,17 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) {
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a3, 4(a1)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a4, 8(a1)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 12(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a5, 12(a0)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a5, 0(a0)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a6, 4(a0)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a7, 8(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a0, 0(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a1, a5, a1
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a3, a6, a3
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a4, a7, a4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a0, a0, a2
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a1, a3, a1
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a0, a4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a0, a1
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a0, 12(a0)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a0, a0, a1
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a1, a6, a3
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a3, a7, a4
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a2, a5, a2
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a1, a0
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a2, a2, a3
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a2, a0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    seqz a0, a0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    ret
 ;
@@ -1471,17 +1471,17 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) {
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a3, 4(a1)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a4, 8(a1)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 12(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a5, 12(a0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a5, 0(a0)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a6, 4(a0)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a7, 8(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a0, 0(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a1, a5, a1
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a3, a6, a3
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a4, a7, a4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a0, a0, a2
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a1, a3, a1
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a0, a4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a0, a1
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a0, 12(a0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a0, a0, a1
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a1, a6, a3
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a3, a7, a4
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a2, a5, a2
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a1, a0
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a2, a2, a3
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a2, a0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    seqz a0, a0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa sp, 8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    tail __riscv_restore_1
@@ -1503,18 +1503,18 @@ define i32 @caller_large_scalars() {
 ; ILP32E-FPELIM-NEXT:    addi s0, sp, 48
 ; ILP32E-FPELIM-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-NEXT:    lui a1, 524272
-; ILP32E-FPELIM-NEXT:    li a2, 1
-; ILP32E-FPELIM-NEXT:    addi a0, sp, 24
+; ILP32E-FPELIM-NEXT:    lui a0, 524272
+; ILP32E-FPELIM-NEXT:    li a1, 1
 ; ILP32E-FPELIM-NEXT:    sw zero, 0(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 4(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 8(sp)
-; ILP32E-FPELIM-NEXT:    sw a1, 12(sp)
-; ILP32E-FPELIM-NEXT:    mv a1, sp
-; ILP32E-FPELIM-NEXT:    sw a2, 24(sp)
+; ILP32E-FPELIM-NEXT:    sw a0, 12(sp)
+; ILP32E-FPELIM-NEXT:    addi a0, sp, 24
+; ILP32E-FPELIM-NEXT:    sw a1, 24(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 28(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 32(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 36(sp)
+; ILP32E-FPELIM-NEXT:    mv a1, sp
 ; ILP32E-FPELIM-NEXT:    call callee_large_scalars
 ; ILP32E-FPELIM-NEXT:    addi sp, s0, -48
 ; ILP32E-FPELIM-NEXT:    .cfi_def_cfa sp, 48
@@ -1537,18 +1537,18 @@ define i32 @caller_large_scalars() {
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 48
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-NEXT:    lui a1, 524272
-; ILP32E-WITHFP-NEXT:    li a2, 1
-; ILP32E-WITHFP-NEXT:    addi a0, sp, 24
+; ILP32E-WITHFP-NEXT:    lui a0, 524272
+; ILP32E-WITHFP-NEXT:    li a1, 1
 ; ILP32E-WITHFP-NEXT:    sw zero, 0(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 4(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 8(sp)
-; ILP32E-WITHFP-NEXT:    sw a1, 12(sp)
-; ILP32E-WITHFP-NEXT:    mv a1, sp
-; ILP32E-WITHFP-NEXT:    sw a2, 24(sp)
+; ILP32E-WITHFP-NEXT:    sw a0, 12(sp)
+; ILP32E-WITHFP-NEXT:    addi a0, sp, 24
+; ILP32E-WITHFP-NEXT:    sw a1, 24(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 28(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 32(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 36(sp)
+; ILP32E-WITHFP-NEXT:    mv a1, sp
 ; ILP32E-WITHFP-NEXT:    call callee_large_scalars
 ; ILP32E-WITHFP-NEXT:    addi sp, s0, -48
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa sp, 48
@@ -1571,18 +1571,18 @@ define i32 @caller_large_scalars() {
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi s0, sp, 48
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a1, 524272
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a2, 1
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a0, sp, 24
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 524272
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a1, 1
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 0(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 4(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 8(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a1, 12(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    mv a1, sp
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a2, 24(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a0, 12(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a0, sp, 24
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a1, 24(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 28(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 32(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 36(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    mv a1, sp
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    call callee_large_scalars
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi sp, s0, -48
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    .cfi_def_cfa sp, 48
@@ -1601,18 +1601,18 @@ define i32 @caller_large_scalars() {
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi s0, sp, 48
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a1, 524272
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a2, 1
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a0, sp, 24
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 524272
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a1, 1
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 0(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 4(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 8(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a1, 12(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    mv a1, sp
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a2, 24(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a0, 12(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a0, sp, 24
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a1, 24(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 28(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 32(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 36(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    mv a1, sp
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    call callee_large_scalars
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi sp, s0, -48
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa sp, 48
@@ -1636,17 +1636,17 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; ILP32E-FPELIM-NEXT:    lw a3, 4(a0)
 ; ILP32E-FPELIM-NEXT:    lw a4, 8(a0)
 ; ILP32E-FPELIM-NEXT:    lw a0, 12(a0)
-; ILP32E-FPELIM-NEXT:    lw a5, 12(a1)
+; ILP32E-FPELIM-NEXT:    lw a5, 0(a1)
 ; ILP32E-FPELIM-NEXT:    lw a6, 4(a1)
 ; ILP32E-FPELIM-NEXT:    lw a7, 8(a1)
-; ILP32E-FPELIM-NEXT:    lw a1, 0(a1)
-; ILP32E-FPELIM-NEXT:    xor a0, a5, a0
-; ILP32E-FPELIM-NEXT:    xor a3, a6, a3
-; ILP32E-FPELIM-NEXT:    xor a4, a7, a4
-; ILP32E-FPELIM-NEXT:    xor a1, a1, a2
-; ILP32E-FPELIM-NEXT:    or a0, a3, a0
-; ILP32E-FPELIM-NEXT:    or a1, a1, a4
+; ILP32E-FPELIM-NEXT:    lw a1, 12(a1)
+; ILP32E-FPELIM-NEXT:    xor a0, a1, a0
+; ILP32E-FPELIM-NEXT:    xor a1, a6, a3
+; ILP32E-FPELIM-NEXT:    xor a3, a7, a4
+; ILP32E-FPELIM-NEXT:    xor a2, a5, a2
 ; ILP32E-FPELIM-NEXT:    or a0, a1, a0
+; ILP32E-FPELIM-NEXT:    or a2, a2, a3
+; ILP32E-FPELIM-NEXT:    or a0, a2, a0
 ; ILP32E-FPELIM-NEXT:    seqz a0, a0
 ; ILP32E-FPELIM-NEXT:    ret
 ;
@@ -1666,17 +1666,17 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; ILP32E-WITHFP-NEXT:    lw a3, 4(a0)
 ; ILP32E-WITHFP-NEXT:    lw a4, 8(a0)
 ; ILP32E-WITHFP-NEXT:    lw a0, 12(a0)
-; ILP32E-WITHFP-NEXT:    lw a5, 12(a1)
+; ILP32E-WITHFP-NEXT:    lw a5, 0(a1)
 ; ILP32E-WITHFP-NEXT:    lw a6, 4(a1)
 ; ILP32E-WITHFP-NEXT:    lw a7, 8(a1)
-; ILP32E-WITHFP-NEXT:    lw a1, 0(a1)
-; ILP32E-WITHFP-NEXT:    xor a0, a5, a0
-; ILP32E-WITHFP-NEXT:    xor a3, a6, a3
-; ILP32E-WITHFP-NEXT:    xor a4, a7, a4
-; ILP32E-WITHFP-NEXT:    xor a1, a1, a2
-; ILP32E-WITHFP-NEXT:    or a0, a3, a0
-; ILP32E-WITHFP-NEXT:    or a1, a1, a4
+; ILP32E-WITHFP-NEXT:    lw a1, 12(a1)
+; ILP32E-WITHFP-NEXT:    xor a0, a1, a0
+; ILP32E-WITHFP-NEXT:    xor a1, a6, a3
+; ILP32E-WITHFP-NEXT:    xor a3, a7, a4
+; ILP32E-WITHFP-NEXT:    xor a2, a5, a2
 ; ILP32E-WITHFP-NEXT:    or a0, a1, a0
+; ILP32E-WITHFP-NEXT:    or a2, a2, a3
+; ILP32E-WITHFP-NEXT:    or a0, a2, a0
 ; ILP32E-WITHFP-NEXT:    seqz a0, a0
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa sp, 8
 ; ILP32E-WITHFP-NEXT:    lw ra, 4(sp) # 4-byte Folded Reload
@@ -1695,17 +1695,17 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a3, 4(a0)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a4, 8(a0)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a0, 12(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a5, 12(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a5, 0(a1)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a6, 4(a1)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a7, 8(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 0(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a0, a5, a0
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a3, a6, a3
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a4, a7, a4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a1, a1, a2
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a3, a0
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a1, a1, a4
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 12(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a0, a1, a0
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a1, a6, a3
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a3, a7, a4
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a2, a5, a2
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a1, a0
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a2, a2, a3
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a2, a0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    seqz a0, a0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    ret
 ;
@@ -1723,17 +1723,17 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a3, 4(a0)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a4, 8(a0)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a0, 12(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a5, 12(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a5, 0(a1)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a6, 4(a1)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a7, 8(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 0(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a0, a5, a0
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a3, a6, a3
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a4, a7, a4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a1, a1, a2
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a3, a0
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a1, a1, a4
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 12(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a0, a1, a0
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a1, a6, a3
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a3, a7, a4
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a2, a5, a2
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a1, a0
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a2, a2, a3
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a2, a0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    seqz a0, a0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa sp, 8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    tail __riscv_restore_1
@@ -1755,30 +1755,30 @@ define i32 @caller_large_scalars_exhausted_regs() {
 ; ILP32E-FPELIM-NEXT:    addi s0, sp, 64
 ; ILP32E-FPELIM-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-NEXT:    addi a4, sp, 16
-; ILP32E-FPELIM-NEXT:    li a5, 9
-; ILP32E-FPELIM-NEXT:    addi a6, sp, 40
-; ILP32E-FPELIM-NEXT:    li a7, 7
-; ILP32E-FPELIM-NEXT:    lui t0, 524272
-; ILP32E-FPELIM-NEXT:    li t1, 8
+; ILP32E-FPELIM-NEXT:    addi a3, sp, 16
+; ILP32E-FPELIM-NEXT:    li a4, 9
+; ILP32E-FPELIM-NEXT:    addi a5, sp, 40
+; ILP32E-FPELIM-NEXT:    li a6, 7
+; ILP32E-FPELIM-NEXT:    lui a7, 524272
+; ILP32E-FPELIM-NEXT:    li t0, 8
 ; ILP32E-FPELIM-NEXT:    li a0, 1
 ; ILP32E-FPELIM-NEXT:    li a1, 2
 ; ILP32E-FPELIM-NEXT:    li a2, 3
+; ILP32E-FPELIM-NEXT:    sw a6, 0(sp)
+; ILP32E-FPELIM-NEXT:    sw a5, 4(sp)
+; ILP32E-FPELIM-NEXT:    sw a4, 8(sp)
+; ILP32E-FPELIM-NEXT:    sw a3, 12(sp)
 ; ILP32E-FPELIM-NEXT:    li a3, 4
-; ILP32E-FPELIM-NEXT:    sw a7, 0(sp)
-; ILP32E-FPELIM-NEXT:    sw a6, 4(sp)
-; ILP32E-FPELIM-NEXT:    sw a5, 8(sp)
-; ILP32E-FPELIM-NEXT:    sw a4, 12(sp)
-; ILP32E-FPELIM-NEXT:    li a4, 5
 ; ILP32E-FPELIM-NEXT:    sw zero, 16(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 20(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 24(sp)
-; ILP32E-FPELIM-NEXT:    sw t0, 28(sp)
-; ILP32E-FPELIM-NEXT:    li a5, 6
-; ILP32E-FPELIM-NEXT:    sw t1, 40(sp)
+; ILP32E-FPELIM-NEXT:    sw a7, 28(sp)
+; ILP32E-FPELIM-NEXT:    li a4, 5
+; ILP32E-FPELIM-NEXT:    sw t0, 40(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 44(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 48(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 52(sp)
+; ILP32E-FPELIM-NEXT:    li a5, 6
 ; ILP32E-FPELIM-NEXT:    call callee_large_scalars_exhausted_regs
 ; ILP32E-FPELIM-NEXT:    addi sp, s0, -64
 ; ILP32E-FPELIM-NEXT:    .cfi_def_cfa sp, 64
@@ -1801,30 +1801,30 @@ define i32 @caller_large_scalars_exhausted_regs() {
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 64
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-NEXT:    addi a4, sp, 16
-; ILP32E-WITHFP-NEXT:    li a5, 9
-; ILP32E-WITHFP-NEXT:    addi a6, sp, 40
-; ILP32E-WITHFP-NEXT:    li a7, 7
-; ILP32E-WITHFP-NEXT:    lui t0, 524272
-; ILP32E-WITHFP-NEXT:    li t1, 8
+; ILP32E-WITHFP-NEXT:    addi a3, sp, 16
+; ILP32E-WITHFP-NEXT:    li a4, 9
+; ILP32E-WITHFP-NEXT:    addi a5, sp, 40
+; ILP32E-WITHFP-NEXT:    li a6, 7
+; ILP32E-WITHFP-NEXT:    lui a7, 524272
+; ILP32E-WITHFP-NEXT:    li t0, 8
 ; ILP32E-WITHFP-NEXT:    li a0, 1
 ; ILP32E-WITHFP-NEXT:    li a1, 2
 ; ILP32E-WITHFP-NEXT:    li a2, 3
+; ILP32E-WITHFP-NEXT:    sw a6, 0(sp)
+; ILP32E-WITHFP-NEXT:    sw a5, 4(sp)
+; ILP32E-WITHFP-NEXT:    sw a4, 8(sp)
+; ILP32E-WITHFP-NEXT:    sw a3, 12(sp)
 ; ILP32E-WITHFP-NEXT:    li a3, 4
-; ILP32E-WITHFP-NEXT:    sw a7, 0(sp)
-; ILP32E-WITHFP-NEXT:    sw a6, 4(sp)
-; ILP32E-WITHFP-NEXT:    sw a5, 8(sp)
-; ILP32E-WITHFP-NEXT:    sw a4, 12(sp)
-; ILP32E-WITHFP-NEXT:    li a4, 5
 ; ILP32E-WITHFP-NEXT:    sw zero, 16(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 20(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 24(sp)
-; ILP32E-WITHFP-NEXT:    sw t0, 28(sp)
-; ILP32E-WITHFP-NEXT:    li a5, 6
-; ILP32E-WITHFP-NEXT:    sw t1, 40(sp)
+; ILP32E-WITHFP-NEXT:    sw a7, 28(sp)
+; ILP32E-WITHFP-NEXT:    li a4, 5
+; ILP32E-WITHFP-NEXT:    sw t0, 40(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 44(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 48(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 52(sp)
+; ILP32E-WITHFP-NEXT:    li a5, 6
 ; ILP32E-WITHFP-NEXT:    call callee_large_scalars_exhausted_regs
 ; ILP32E-WITHFP-NEXT:    addi sp, s0, -64
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa sp, 64
@@ -1847,30 +1847,30 @@ define i32 @caller_large_scalars_exhausted_regs() {
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi s0, sp, 64
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a4, sp, 16
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a5, 9
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a6, sp, 40
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a7, 7
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t0, 524272
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li t1, 8
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a3, sp, 16
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a4, 9
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a5, sp, 40
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a6, 7
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a7, 524272
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li t0, 8
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a0, 1
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a1, 2
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a2, 3
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a6, 0(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a5, 4(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a4, 8(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a3, 12(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a7, 0(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a6, 4(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a5, 8(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a4, 12(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a4, 5
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 16(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 20(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 24(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t0, 28(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a5, 6
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t1, 40(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a7, 28(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a4, 5
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t0, 40(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 44(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 48(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 52(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a5, 6
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    call callee_large_scalars_exhausted_regs
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi sp, s0, -64
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    .cfi_def_cfa sp, 64
@@ -1889,30 +1889,30 @@ define i32 @caller_large_scalars_exhausted_regs() {
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi s0, sp, 64
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a4, sp, 16
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a5, 9
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a6, sp, 40
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a7, 7
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t0, 524272
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li t1, 8
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a3, sp, 16
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a4, 9
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a5, sp, 40
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a6, 7
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a7, 524272
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li t0, 8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a0, 1
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a1, 2
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a2, 3
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a6, 0(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a5, 4(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a4, 8(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a3, 12(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a7, 0(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a6, 4(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a5, 8(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a4, 12(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a4, 5
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 16(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 20(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 24(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t0, 28(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a5, 6
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t1, 40(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a7, 28(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a4, 5
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t0, 40(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 44(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 48(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 52(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a5, 6
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    call callee_large_scalars_exhausted_regs
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi sp, s0, -64
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa sp, 64
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll
index dabd2a7ce9a73..cb98422ebd3ae 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll
@@ -59,9 +59,9 @@ define i32 @caller_float_in_fpr_exhausted_gprs() nounwind {
 ; RV32-ILP32FD-NEXT:    li a0, 1
 ; RV32-ILP32FD-NEXT:    li a2, 2
 ; RV32-ILP32FD-NEXT:    li a4, 3
+; RV32-ILP32FD-NEXT:    sw a1, 0(sp)
 ; RV32-ILP32FD-NEXT:    fmv.w.x fa0, a3
 ; RV32-ILP32FD-NEXT:    li a6, 4
-; RV32-ILP32FD-NEXT:    sw a1, 0(sp)
 ; RV32-ILP32FD-NEXT:    li a1, 0
 ; RV32-ILP32FD-NEXT:    li a3, 0
 ; RV32-ILP32FD-NEXT:    li a5, 0
@@ -141,28 +141,28 @@ define i32 @caller_float_on_stack_exhausted_gprs_fprs() nounwind {
 ; RV32-ILP32FD:       # %bb.0:
 ; RV32-ILP32FD-NEXT:    addi sp, sp, -16
 ; RV32-ILP32FD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-ILP32FD-NEXT:    lui a1, 267520
-; RV32-ILP32FD-NEXT:    lui a0, 262144
+; RV32-ILP32FD-NEXT:    lui a0, 267520
+; RV32-ILP32FD-NEXT:    lui a1, 262144
 ; RV32-ILP32FD-NEXT:    lui a2, 264192
 ; RV32-ILP32FD-NEXT:    lui a3, 265216
-; RV32-ILP32FD-NEXT:    lui a4, 266240
-; RV32-ILP32FD-NEXT:    lui a5, 266496
-; RV32-ILP32FD-NEXT:    lui a6, 266752
-; RV32-ILP32FD-NEXT:    lui a7, 267008
-; RV32-ILP32FD-NEXT:    fmv.w.x fa0, a0
-; RV32-ILP32FD-NEXT:    lui t0, 267264
-; RV32-ILP32FD-NEXT:    fmv.w.x fa1, a2
+; RV32-ILP32FD-NEXT:    lui a5, 266240
+; RV32-ILP32FD-NEXT:    lui a6, 266496
+; RV32-ILP32FD-NEXT:    lui a7, 266752
+; RV32-ILP32FD-NEXT:    lui t0, 267008
+; RV32-ILP32FD-NEXT:    sw a0, 0(sp)
+; RV32-ILP32FD-NEXT:    lui t1, 267264
+; RV32-ILP32FD-NEXT:    fmv.w.x fa0, a1
 ; RV32-ILP32FD-NEXT:    li a0, 1
-; RV32-ILP32FD-NEXT:    fmv.w.x fa2, a3
+; RV32-ILP32FD-NEXT:    fmv.w.x fa1, a2
 ; RV32-ILP32FD-NEXT:    li a2, 3
-; RV32-ILP32FD-NEXT:    fmv.w.x fa3, a4
+; RV32-ILP32FD-NEXT:    fmv.w.x fa2, a3
 ; RV32-ILP32FD-NEXT:    li a4, 5
-; RV32-ILP32FD-NEXT:    fmv.w.x fa4, a5
-; RV32-ILP32FD-NEXT:    fmv.w.x fa5, a6
-; RV32-ILP32FD-NEXT:    fmv.w.x fa6, a7
-; RV32-ILP32FD-NEXT:    fmv.w.x fa7, t0
+; RV32-ILP32FD-NEXT:    fmv.w.x fa3, a5
+; RV32-ILP32FD-NEXT:    fmv.w.x fa4, a6
+; RV32-ILP32FD-NEXT:    fmv.w.x fa5, a7
+; RV32-ILP32FD-NEXT:    fmv.w.x fa6, t0
+; RV32-ILP32FD-NEXT:    fmv.w.x fa7, t1
 ; RV32-ILP32FD-NEXT:    li a6, 7
-; RV32-ILP32FD-NEXT:    sw a1, 0(sp)
 ; RV32-ILP32FD-NEXT:    li a1, 0
 ; RV32-ILP32FD-NEXT:    li a3, 0
 ; RV32-ILP32FD-NEXT:    li a5, 0
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
index 746b71a08a30b..219fca5e48c52 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
@@ -89,9 +89,9 @@ define i32 @caller_many_scalars() nounwind {
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a5, 5
 ; RV64I-NEXT:    li a6, 6
-; RV64I-NEXT:    li a7, 7
 ; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    li a7, 7
 ; RV64I-NEXT:    li a4, 0
 ; RV64I-NEXT:    call callee_many_scalars
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -110,17 +110,17 @@ define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind {
 ; RV64I-NEXT:    ld a3, 8(a1)
 ; RV64I-NEXT:    ld a4, 16(a1)
 ; RV64I-NEXT:    ld a1, 24(a1)
-; RV64I-NEXT:    ld a5, 24(a0)
+; RV64I-NEXT:    ld a5, 0(a0)
 ; RV64I-NEXT:    ld a6, 8(a0)
 ; RV64I-NEXT:    ld a7, 16(a0)
-; RV64I-NEXT:    ld a0, 0(a0)
-; RV64I-NEXT:    xor a1, a5, a1
-; RV64I-NEXT:    xor a3, a6, a3
-; RV64I-NEXT:    xor a4, a7, a4
-; RV64I-NEXT:    xor a0, a0, a2
-; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    or a0, a0, a4
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    ld a0, 24(a0)
+; RV64I-NEXT:    xor a0, a0, a1
+; RV64I-NEXT:    xor a1, a6, a3
+; RV64I-NEXT:    xor a3, a7, a4
+; RV64I-NEXT:    xor a2, a5, a2
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    or a2, a2, a3
+; RV64I-NEXT:    or a0, a2, a0
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
   %1 = icmp eq i256 %a, %b
@@ -133,18 +133,18 @@ define i64 @caller_large_scalars() nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -80
 ; RV64I-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    li a2, 2
-; RV64I-NEXT:    li a3, 1
+; RV64I-NEXT:    li a1, 2
+; RV64I-NEXT:    li a2, 1
 ; RV64I-NEXT:    addi a0, sp, 32
-; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    sd a2, 0(sp)
+; RV64I-NEXT:    sd a1, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    sd a3, 32(sp)
+; RV64I-NEXT:    sd a2, 32(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
 ; RV64I-NEXT:    sd zero, 48(sp)
 ; RV64I-NEXT:    sd zero, 56(sp)
+; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    call callee_large_scalars
 ; RV64I-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 80
@@ -165,17 +165,17 @@ define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d,
 ; RV64I-NEXT:    ld a2, 8(a7)
 ; RV64I-NEXT:    ld a3, 16(a7)
 ; RV64I-NEXT:    ld a4, 24(a7)
-; RV64I-NEXT:    ld a5, 24(a0)
+; RV64I-NEXT:    ld a5, 0(a0)
 ; RV64I-NEXT:    ld a6, 8(a0)
 ; RV64I-NEXT:    ld a7, 16(a0)
-; RV64I-NEXT:    ld a0, 0(a0)
-; RV64I-NEXT:    xor a4, a4, a5
+; RV64I-NEXT:    ld a0, 24(a0)
+; RV64I-NEXT:    xor a0, a4, a0
 ; RV64I-NEXT:    xor a2, a2, a6
 ; RV64I-NEXT:    xor a3, a3, a7
-; RV64I-NEXT:    xor a0, a1, a0
-; RV64I-NEXT:    or a2, a2, a4
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    xor a1, a1, a5
+; RV64I-NEXT:    or a0, a2, a0
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
   %1 = icmp eq i256 %h, %j
@@ -188,28 +188,28 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -96
 ; RV64I-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    addi a7, sp, 16
-; RV64I-NEXT:    li t0, 9
-; RV64I-NEXT:    li t1, 10
-; RV64I-NEXT:    li t2, 8
+; RV64I-NEXT:    addi a6, sp, 16
+; RV64I-NEXT:    li a7, 9
+; RV64I-NEXT:    li t0, 10
+; RV64I-NEXT:    li t1, 8
 ; RV64I-NEXT:    li a0, 1
 ; RV64I-NEXT:    li a1, 2
 ; RV64I-NEXT:    li a2, 3
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    li a5, 6
+; RV64I-NEXT:    sd a7, 0(sp)
+; RV64I-NEXT:    sd a6, 8(sp)
 ; RV64I-NEXT:    li a6, 7
-; RV64I-NEXT:    sd t0, 0(sp)
-; RV64I-NEXT:    sd a7, 8(sp)
-; RV64I-NEXT:    addi a7, sp, 48
-; RV64I-NEXT:    sd t1, 16(sp)
+; RV64I-NEXT:    sd t0, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
 ; RV64I-NEXT:    sd zero, 32(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
-; RV64I-NEXT:    sd t2, 48(sp)
+; RV64I-NEXT:    sd t1, 48(sp)
 ; RV64I-NEXT:    sd zero, 56(sp)
 ; RV64I-NEXT:    sd zero, 64(sp)
 ; RV64I-NEXT:    sd zero, 72(sp)
+; RV64I-NEXT:    addi a7, sp, 48
 ; RV64I-NEXT:    call callee_large_scalars_exhausted_regs
 ; RV64I-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 96
@@ -329,13 +329,13 @@ define i64 @callee_aligned_stack(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i128 %f
 ; RV64I-LABEL: callee_aligned_stack:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    ld a0, 32(sp)
-; RV64I-NEXT:    ld a1, 0(sp)
-; RV64I-NEXT:    ld a2, 16(sp)
+; RV64I-NEXT:    ld a1, 16(sp)
+; RV64I-NEXT:    ld a2, 0(sp)
 ; RV64I-NEXT:    ld a3, 40(sp)
 ; RV64I-NEXT:    add a5, a5, a7
-; RV64I-NEXT:    add a1, a5, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    add a2, a5, a2
 ; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    add a0, a2, a0
 ; RV64I-NEXT:    add a0, a0, a3
 ; RV64I-NEXT:    ret
   %f_trunc = trunc i128 %f to i64
@@ -356,24 +356,24 @@ define void @caller_aligned_stack() nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -64
 ; RV64I-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    li a6, 12
-; RV64I-NEXT:    li a7, 11
-; RV64I-NEXT:    li t0, 10
-; RV64I-NEXT:    li t1, 9
-; RV64I-NEXT:    li t2, 8
+; RV64I-NEXT:    li a5, 12
+; RV64I-NEXT:    li a6, 11
+; RV64I-NEXT:    li a7, 10
+; RV64I-NEXT:    li t0, 9
+; RV64I-NEXT:    li t1, 8
 ; RV64I-NEXT:    li a0, 1
 ; RV64I-NEXT:    li a1, 2
 ; RV64I-NEXT:    li a2, 3
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 5
+; RV64I-NEXT:    sd a6, 40(sp)
+; RV64I-NEXT:    sd a5, 48(sp)
 ; RV64I-NEXT:    li a5, 6
-; RV64I-NEXT:    sd a7, 40(sp)
-; RV64I-NEXT:    sd a6, 48(sp)
-; RV64I-NEXT:    li a7, 7
-; RV64I-NEXT:    sd t2, 0(sp)
-; RV64I-NEXT:    sd t1, 16(sp)
+; RV64I-NEXT:    sd t1, 0(sp)
+; RV64I-NEXT:    sd t0, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    sd t0, 32(sp)
+; RV64I-NEXT:    sd a7, 32(sp)
+; RV64I-NEXT:    li a7, 7
 ; RV64I-NEXT:    li a6, 0
 ; RV64I-NEXT:    call callee_aligned_stack
 ; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64.ll
index c2db8fe5248fd..d43f43ceffec3 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-lp64.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64.ll
@@ -112,8 +112,8 @@ define i64 @caller_float_on_stack() nounwind {
 ; RV64I-FPELIM-NEXT:    li a0, 1
 ; RV64I-FPELIM-NEXT:    li a2, 2
 ; RV64I-FPELIM-NEXT:    li a4, 3
-; RV64I-FPELIM-NEXT:    li a6, 4
 ; RV64I-FPELIM-NEXT:    sd a1, 0(sp)
+; RV64I-FPELIM-NEXT:    li a6, 4
 ; RV64I-FPELIM-NEXT:    li a1, 0
 ; RV64I-FPELIM-NEXT:    li a3, 0
 ; RV64I-FPELIM-NEXT:    li a5, 0
@@ -133,8 +133,8 @@ define i64 @caller_float_on_stack() nounwind {
 ; RV64I-WITHFP-NEXT:    li a0, 1
 ; RV64I-WITHFP-NEXT:    li a2, 2
 ; RV64I-WITHFP-NEXT:    li a4, 3
-; RV64I-WITHFP-NEXT:    li a6, 4
 ; RV64I-WITHFP-NEXT:    sd a1, 0(sp)
+; RV64I-WITHFP-NEXT:    li a6, 4
 ; RV64I-WITHFP-NEXT:    li a1, 0
 ; RV64I-WITHFP-NEXT:    li a3, 0
 ; RV64I-WITHFP-NEXT:    li a5, 0
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64e.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64e.ll
index 985135a086e24..cc10e900faa0b 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-lp64e.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64e.ll
@@ -118,10 +118,10 @@ define i64 @caller_float_on_stack() nounwind {
 ; RV64I-LP64E-FPELIM-NEXT:    li a3, 4
 ; RV64I-LP64E-FPELIM-NEXT:    li a0, 1
 ; RV64I-LP64E-FPELIM-NEXT:    li a2, 2
-; RV64I-LP64E-FPELIM-NEXT:    li a4, 3
 ; RV64I-LP64E-FPELIM-NEXT:    sd a3, 0(sp)
 ; RV64I-LP64E-FPELIM-NEXT:    sd zero, 8(sp)
 ; RV64I-LP64E-FPELIM-NEXT:    sd a1, 16(sp)
+; RV64I-LP64E-FPELIM-NEXT:    li a4, 3
 ; RV64I-LP64E-FPELIM-NEXT:    li a1, 0
 ; RV64I-LP64E-FPELIM-NEXT:    li a3, 0
 ; RV64I-LP64E-FPELIM-NEXT:    li a5, 0
@@ -143,10 +143,10 @@ define i64 @caller_float_on_stack() nounwind {
 ; RV64I-LP64E-WITHFP-NEXT:    li a3, 4
 ; RV64I-LP64E-WITHFP-NEXT:    li a0, 1
 ; RV64I-LP64E-WITHFP-NEXT:    li a2, 2
-; RV64I-LP64E-WITHFP-NEXT:    li a4, 3
 ; RV64I-LP64E-WITHFP-NEXT:    sd a3, 0(sp)
 ; RV64I-LP64E-WITHFP-NEXT:    sd zero, 8(sp)
 ; RV64I-LP64E-WITHFP-NEXT:    sd a1, 16(sp)
+; RV64I-LP64E-WITHFP-NEXT:    li a4, 3
 ; RV64I-LP64E-WITHFP-NEXT:    li a1, 0
 ; RV64I-LP64E-WITHFP-NEXT:    li a3, 0
 ; RV64I-LP64E-WITHFP-NEXT:    li a5, 0
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-rv32f-ilp32.ll b/llvm/test/CodeGen/RISCV/calling-conv-rv32f-ilp32.ll
index eaba1acffa054..284de1988d37e 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-rv32f-ilp32.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-rv32f-ilp32.ll
@@ -37,9 +37,9 @@ define float @caller_onstack_f32_noop(float %a) nounwind {
 ; RV32IF-NEXT:    li a0, 1
 ; RV32IF-NEXT:    li a2, 2
 ; RV32IF-NEXT:    li a4, 3
-; RV32IF-NEXT:    li a6, 4
 ; RV32IF-NEXT:    sw a3, 0(sp)
 ; RV32IF-NEXT:    sw a1, 4(sp)
+; RV32IF-NEXT:    li a6, 4
 ; RV32IF-NEXT:    li a1, 0
 ; RV32IF-NEXT:    li a3, 0
 ; RV32IF-NEXT:    li a5, 0
@@ -61,12 +61,12 @@ define float @caller_onstack_f32_fadd(float %a, float %b) nounwind {
 ; RV32IF-NEXT:    fmv.w.x fa4, a0
 ; RV32IF-NEXT:    fadd.s fa3, fa4, fa5
 ; RV32IF-NEXT:    fsub.s fa5, fa5, fa4
+; RV32IF-NEXT:    fsw fa3, 0(sp)
+; RV32IF-NEXT:    fsw fa5, 4(sp)
 ; RV32IF-NEXT:    li a0, 1
 ; RV32IF-NEXT:    li a2, 2
 ; RV32IF-NEXT:    li a4, 3
 ; RV32IF-NEXT:    li a6, 4
-; RV32IF-NEXT:    fsw fa3, 0(sp)
-; RV32IF-NEXT:    fsw fa5, 4(sp)
 ; RV32IF-NEXT:    li a1, 0
 ; RV32IF-NEXT:    li a3, 0
 ; RV32IF-NEXT:    li a5, 0
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-rv32f-ilp32e.ll b/llvm/test/CodeGen/RISCV/calling-conv-rv32f-ilp32e.ll
index 63d4ea5fee331..6bc0e773f0aff 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-rv32f-ilp32e.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-rv32f-ilp32e.ll
@@ -34,14 +34,14 @@ define float @caller_onstack_f32_noop(float %a) nounwind {
 ; RV32IF-ILP32E-NEXT:    sw ra, 16(sp) # 4-byte Folded Spill
 ; RV32IF-ILP32E-NEXT:    mv a1, a0
 ; RV32IF-ILP32E-NEXT:    lui a3, 264704
-; RV32IF-ILP32E-NEXT:    li a5, 4
+; RV32IF-ILP32E-NEXT:    li a4, 4
 ; RV32IF-ILP32E-NEXT:    li a0, 1
 ; RV32IF-ILP32E-NEXT:    li a2, 2
-; RV32IF-ILP32E-NEXT:    li a4, 3
-; RV32IF-ILP32E-NEXT:    sw a5, 0(sp)
+; RV32IF-ILP32E-NEXT:    sw a4, 0(sp)
 ; RV32IF-ILP32E-NEXT:    sw zero, 4(sp)
 ; RV32IF-ILP32E-NEXT:    sw a3, 8(sp)
 ; RV32IF-ILP32E-NEXT:    sw a1, 12(sp)
+; RV32IF-ILP32E-NEXT:    li a4, 3
 ; RV32IF-ILP32E-NEXT:    li a1, 0
 ; RV32IF-ILP32E-NEXT:    li a3, 0
 ; RV32IF-ILP32E-NEXT:    li a5, 0
@@ -65,11 +65,11 @@ define float @caller_onstack_f32_fadd(float %a, float %b) nounwind {
 ; RV32IF-ILP32E-NEXT:    li a1, 4
 ; RV32IF-ILP32E-NEXT:    li a0, 1
 ; RV32IF-ILP32E-NEXT:    li a2, 2
-; RV32IF-ILP32E-NEXT:    li a4, 3
 ; RV32IF-ILP32E-NEXT:    sw a1, 0(sp)
 ; RV32IF-ILP32E-NEXT:    sw zero, 4(sp)
 ; RV32IF-ILP32E-NEXT:    fsw fa3, 8(sp)
 ; RV32IF-ILP32E-NEXT:    fsw fa5, 12(sp)
+; RV32IF-ILP32E-NEXT:    li a4, 3
 ; RV32IF-ILP32E-NEXT:    li a1, 0
 ; RV32IF-ILP32E-NEXT:    li a3, 0
 ; RV32IF-ILP32E-NEXT:    li a5, 0
diff --git a/llvm/test/CodeGen/RISCV/calls.ll b/llvm/test/CodeGen/RISCV/calls.ll
index cf0e625f3c6c7..6aef8b18f5b77 100644
--- a/llvm/test/CodeGen/RISCV/calls.ll
+++ b/llvm/test/CodeGen/RISCV/calls.ll
@@ -654,11 +654,11 @@ define i32 @test_call_external_many_args(i32 %a) nounwind {
 ; RV64I-LARGE-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-LARGE-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-LARGE-NEXT:    mv s0, a0
+; RV64I-LARGE-NEXT:    sd a0, 0(sp)
+; RV64I-LARGE-NEXT:    sd a0, 8(sp)
 ; RV64I-LARGE-NEXT:  .Lpcrel_hi4:
 ; RV64I-LARGE-NEXT:    auipc a0, %pcrel_hi(.LCPI8_0)
 ; RV64I-LARGE-NEXT:    ld t1, %pcrel_lo(.Lpcrel_hi4)(a0)
-; RV64I-LARGE-NEXT:    sd s0, 0(sp)
-; RV64I-LARGE-NEXT:    sd s0, 8(sp)
 ; RV64I-LARGE-NEXT:    mv a0, s0
 ; RV64I-LARGE-NEXT:    mv a1, s0
 ; RV64I-LARGE-NEXT:    mv a2, s0
@@ -681,11 +681,11 @@ define i32 @test_call_external_many_args(i32 %a) nounwind {
 ; RV64I-LARGE-ZICFILP-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64I-LARGE-ZICFILP-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64I-LARGE-ZICFILP-NEXT:    mv s0, a0
+; RV64I-LARGE-ZICFILP-NEXT:    sd a0, 0(sp)
+; RV64I-LARGE-ZICFILP-NEXT:    sd a0, 8(sp)
 ; RV64I-LARGE-ZICFILP-NEXT:  .Lpcrel_hi4:
 ; RV64I-LARGE-ZICFILP-NEXT:    auipc a0, %pcrel_hi(.LCPI8_0)
 ; RV64I-LARGE-ZICFILP-NEXT:    ld t2, %pcrel_lo(.Lpcrel_hi4)(a0)
-; RV64I-LARGE-ZICFILP-NEXT:    sd s0, 0(sp)
-; RV64I-LARGE-ZICFILP-NEXT:    sd s0, 8(sp)
 ; RV64I-LARGE-ZICFILP-NEXT:    mv a0, s0
 ; RV64I-LARGE-ZICFILP-NEXT:    mv a1, s0
 ; RV64I-LARGE-ZICFILP-NEXT:    mv a2, s0
@@ -823,11 +823,11 @@ define i32 @test_call_defined_many_args(i32 %a) nounwind {
 ; RV64I-LARGE:       # %bb.0:
 ; RV64I-LARGE-NEXT:    addi sp, sp, -32
 ; RV64I-LARGE-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-LARGE-NEXT:    sd a0, 0(sp)
+; RV64I-LARGE-NEXT:    sd a0, 8(sp)
 ; RV64I-LARGE-NEXT:  .Lpcrel_hi5:
 ; RV64I-LARGE-NEXT:    auipc a1, %pcrel_hi(.LCPI10_0)
 ; RV64I-LARGE-NEXT:    ld t1, %pcrel_lo(.Lpcrel_hi5)(a1)
-; RV64I-LARGE-NEXT:    sd a0, 0(sp)
-; RV64I-LARGE-NEXT:    sd a0, 8(sp)
 ; RV64I-LARGE-NEXT:    mv a1, a0
 ; RV64I-LARGE-NEXT:    mv a2, a0
 ; RV64I-LARGE-NEXT:    mv a3, a0
@@ -845,11 +845,11 @@ define i32 @test_call_defined_many_args(i32 %a) nounwind {
 ; RV64I-LARGE-ZICFILP-NEXT:    lpad 0
 ; RV64I-LARGE-ZICFILP-NEXT:    addi sp, sp, -32
 ; RV64I-LARGE-ZICFILP-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-LARGE-ZICFILP-NEXT:    sd a0, 0(sp)
+; RV64I-LARGE-ZICFILP-NEXT:    sd a0, 8(sp)
 ; RV64I-LARGE-ZICFILP-NEXT:  .Lpcrel_hi5:
 ; RV64I-LARGE-ZICFILP-NEXT:    auipc a1, %pcrel_hi(.LCPI10_0)
 ; RV64I-LARGE-ZICFILP-NEXT:    ld t2, %pcrel_lo(.Lpcrel_hi5)(a1)
-; RV64I-LARGE-ZICFILP-NEXT:    sd a0, 0(sp)
-; RV64I-LARGE-ZICFILP-NEXT:    sd a0, 8(sp)
 ; RV64I-LARGE-ZICFILP-NEXT:    mv a1, a0
 ; RV64I-LARGE-ZICFILP-NEXT:    mv a2, a0
 ; RV64I-LARGE-ZICFILP-NEXT:    mv a3, a0
diff --git a/llvm/test/CodeGen/RISCV/codemodel-lowering.ll b/llvm/test/CodeGen/RISCV/codemodel-lowering.ll
index 4831f0b24c7fe..ab8460d944b33 100644
--- a/llvm/test/CodeGen/RISCV/codemodel-lowering.ll
+++ b/llvm/test/CodeGen/RISCV/codemodel-lowering.ll
@@ -119,9 +119,9 @@ define signext i32 @lower_blockaddress_displ(i32 signext %w) nounwind {
 ; RV32I-SMALL-NEXT:    addi sp, sp, -16
 ; RV32I-SMALL-NEXT:    lui a1, %hi(.Ltmp0)
 ; RV32I-SMALL-NEXT:    addi a1, a1, %lo(.Ltmp0)
-; RV32I-SMALL-NEXT:    li a2, 101
 ; RV32I-SMALL-NEXT:    sw a1, 8(sp)
-; RV32I-SMALL-NEXT:    blt a0, a2, .LBB2_3
+; RV32I-SMALL-NEXT:    li a1, 101
+; RV32I-SMALL-NEXT:    blt a0, a1, .LBB2_3
 ; RV32I-SMALL-NEXT:  # %bb.1: # %if.then
 ; RV32I-SMALL-NEXT:    lw a0, 8(sp)
 ; RV32I-SMALL-NEXT:    jr a0
@@ -141,9 +141,9 @@ define signext i32 @lower_blockaddress_displ(i32 signext %w) nounwind {
 ; RV32I-MEDIUM-NEXT:  .Lpcrel_hi2:
 ; RV32I-MEDIUM-NEXT:    auipc a1, %pcrel_hi(.Ltmp0)
 ; RV32I-MEDIUM-NEXT:    addi a1, a1, %pcrel_lo(.Lpcrel_hi2)
-; RV32I-MEDIUM-NEXT:    li a2, 101
 ; RV32I-MEDIUM-NEXT:    sw a1, 8(sp)
-; RV32I-MEDIUM-NEXT:    blt a0, a2, .LBB2_3
+; RV32I-MEDIUM-NEXT:    li a1, 101
+; RV32I-MEDIUM-NEXT:    blt a0, a1, .LBB2_3
 ; RV32I-MEDIUM-NEXT:  # %bb.1: # %if.then
 ; RV32I-MEDIUM-NEXT:    lw a0, 8(sp)
 ; RV32I-MEDIUM-NEXT:    jr a0
@@ -162,9 +162,9 @@ define signext i32 @lower_blockaddress_displ(i32 signext %w) nounwind {
 ; RV64I-SMALL-NEXT:    addi sp, sp, -16
 ; RV64I-SMALL-NEXT:    lui a1, %hi(.Ltmp0)
 ; RV64I-SMALL-NEXT:    addi a1, a1, %lo(.Ltmp0)
-; RV64I-SMALL-NEXT:    li a2, 101
 ; RV64I-SMALL-NEXT:    sd a1, 8(sp)
-; RV64I-SMALL-NEXT:    blt a0, a2, .LBB2_3
+; RV64I-SMALL-NEXT:    li a1, 101
+; RV64I-SMALL-NEXT:    blt a0, a1, .LBB2_3
 ; RV64I-SMALL-NEXT:  # %bb.1: # %if.then
 ; RV64I-SMALL-NEXT:    ld a0, 8(sp)
 ; RV64I-SMALL-NEXT:    jr a0
@@ -184,9 +184,9 @@ define signext i32 @lower_blockaddress_displ(i32 signext %w) nounwind {
 ; RV64I-MEDIUM-NEXT:  .Lpcrel_hi2:
 ; RV64I-MEDIUM-NEXT:    auipc a1, %pcrel_hi(.Ltmp0)
 ; RV64I-MEDIUM-NEXT:    addi a1, a1, %pcrel_lo(.Lpcrel_hi2)
-; RV64I-MEDIUM-NEXT:    li a2, 101
 ; RV64I-MEDIUM-NEXT:    sd a1, 8(sp)
-; RV64I-MEDIUM-NEXT:    blt a0, a2, .LBB2_3
+; RV64I-MEDIUM-NEXT:    li a1, 101
+; RV64I-MEDIUM-NEXT:    blt a0, a1, .LBB2_3
 ; RV64I-MEDIUM-NEXT:  # %bb.1: # %if.then
 ; RV64I-MEDIUM-NEXT:    ld a0, 8(sp)
 ; RV64I-MEDIUM-NEXT:    jr a0
@@ -206,9 +206,9 @@ define signext i32 @lower_blockaddress_displ(i32 signext %w) nounwind {
 ; RV64I-LARGE-NEXT:  .Lpcrel_hi2:
 ; RV64I-LARGE-NEXT:    auipc a1, %pcrel_hi(.Ltmp0)
 ; RV64I-LARGE-NEXT:    addi a1, a1, %pcrel_lo(.Lpcrel_hi2)
-; RV64I-LARGE-NEXT:    li a2, 101
 ; RV64I-LARGE-NEXT:    sd a1, 8(sp)
-; RV64I-LARGE-NEXT:    blt a0, a2, .LBB2_3
+; RV64I-LARGE-NEXT:    li a1, 101
+; RV64I-LARGE-NEXT:    blt a0, a1, .LBB2_3
 ; RV64I-LARGE-NEXT:  # %bb.1: # %if.then
 ; RV64I-LARGE-NEXT:    ld a0, 8(sp)
 ; RV64I-LARGE-NEXT:    jr a0
diff --git a/llvm/test/CodeGen/RISCV/condbinops.ll b/llvm/test/CodeGen/RISCV/condbinops.ll
index dc81c13bfb6a3..e898661665e99 100644
--- a/llvm/test/CodeGen/RISCV/condbinops.ll
+++ b/llvm/test/CodeGen/RISCV/condbinops.ll
@@ -411,8 +411,8 @@ define i64 @shl64(i64 %x, i64 %y, i1 %c) {
 ; RV32I-NEXT:    slli a4, a4, 31
 ; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    and a4, a4, a2
-; RV32I-NEXT:    addi a3, a4, -32
 ; RV32I-NEXT:    sll a2, a0, a4
+; RV32I-NEXT:    addi a3, a4, -32
 ; RV32I-NEXT:    bltz a3, .LBB8_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a1, a2
@@ -486,8 +486,8 @@ define i64 @ashr64(i64 %x, i64 %y, i1 %c) {
 ; RV32I-NEXT:    slli a4, a4, 31
 ; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    and a2, a4, a2
-; RV32I-NEXT:    addi a4, a2, -32
 ; RV32I-NEXT:    sra a0, a1, a2
+; RV32I-NEXT:    addi a4, a2, -32
 ; RV32I-NEXT:    bltz a4, .LBB9_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    srai a1, a1, 31
@@ -496,10 +496,9 @@ define i64 @ashr64(i64 %x, i64 %y, i1 %c) {
 ; RV32I-NEXT:    srl a3, a3, a2
 ; RV32I-NEXT:    not a2, a2
 ; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    sll a1, a1, a2
-; RV32I-NEXT:    or a3, a3, a1
+; RV32I-NEXT:    sll a2, a1, a2
 ; RV32I-NEXT:    mv a1, a0
-; RV32I-NEXT:    mv a0, a3
+; RV32I-NEXT:    or a0, a3, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: ashr64:
@@ -562,8 +561,8 @@ define i64 @lshr64(i64 %x, i64 %y, i1 %c) {
 ; RV32I-NEXT:    slli a4, a4, 31
 ; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    and a4, a4, a2
-; RV32I-NEXT:    addi a3, a4, -32
 ; RV32I-NEXT:    srl a2, a1, a4
+; RV32I-NEXT:    addi a3, a4, -32
 ; RV32I-NEXT:    bltz a3, .LBB10_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a2
diff --git a/llvm/test/CodeGen/RISCV/condops.ll b/llvm/test/CodeGen/RISCV/condops.ll
index 6c2ba493ffcd5..bd9e543e955d5 100644
--- a/llvm/test/CodeGen/RISCV/condops.ll
+++ b/llvm/test/CodeGen/RISCV/condops.ll
@@ -1348,13 +1348,13 @@ define i64 @seteq(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    xor a1, a1, a3
 ; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    beqz a1, .LBB23_2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    beqz a0, .LBB23_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a6
+; RV32I-NEXT:    mv a4, a6
 ; RV32I-NEXT:    mv a5, a7
 ; RV32I-NEXT:  .LBB23_2:
+; RV32I-NEXT:    mv a0, a4
 ; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    ret
 ;
@@ -1425,13 +1425,13 @@ define i64 @setne(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    xor a1, a1, a3
 ; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    bnez a1, .LBB24_2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    bnez a0, .LBB24_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a6
+; RV32I-NEXT:    mv a4, a6
 ; RV32I-NEXT:    mv a5, a7
 ; RV32I-NEXT:  .LBB24_2:
+; RV32I-NEXT:    mv a0, a4
 ; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    ret
 ;
@@ -2196,13 +2196,13 @@ define i64 @setule(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 define i64 @seteq_zero(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: seteq_zero:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    beqz a1, .LBB33_2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    beqz a0, .LBB33_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a2, a4
 ; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB33_2:
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
@@ -2264,13 +2264,13 @@ define i64 @seteq_zero(i64 %a, i64 %rs1, i64 %rs2) {
 define i64 @setne_zero(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: setne_zero:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a1, .LBB34_2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    bnez a0, .LBB34_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a2, a4
 ; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB34_2:
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
@@ -2333,13 +2333,13 @@ define i64 @seteq_constant(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: seteq_constant:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    xori a0, a0, 123
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    beqz a1, .LBB35_2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    beqz a0, .LBB35_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a2, a4
 ; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB35_2:
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
@@ -2408,13 +2408,13 @@ define i64 @setne_constant(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: setne_constant:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    xori a0, a0, 456
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a1, .LBB36_2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    bnez a0, .LBB36_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a2, a4
 ; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB36_2:
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
@@ -2483,13 +2483,13 @@ define i64 @seteq_2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: seteq_2048:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    binvi a0, a0, 11
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    beqz a1, .LBB37_2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    beqz a0, .LBB37_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a2, a4
 ; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB37_2:
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
@@ -2559,13 +2559,13 @@ define i64 @seteq_neg2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    not a1, a1
 ; RV32I-NEXT:    xori a0, a0, -2048
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    beqz a1, .LBB38_2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    beqz a0, .LBB38_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a2, a4
 ; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB38_2:
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
@@ -2637,13 +2637,13 @@ define i64 @setne_neg2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    not a1, a1
 ; RV32I-NEXT:    xori a0, a0, -2048
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a1, .LBB39_2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    bnez a0, .LBB39_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a2, a4
 ; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB39_2:
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/copysign-casts.ll b/llvm/test/CodeGen/RISCV/copysign-casts.ll
index 53de36f1699a9..5400ec6d005ef 100644
--- a/llvm/test/CodeGen/RISCV/copysign-casts.ll
+++ b/llvm/test/CodeGen/RISCV/copysign-casts.ll
@@ -702,17 +702,17 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV32IFD-LABEL: fold_demote_h_d:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    fmv.x.w a0, fa0
 ; RV32IFD-NEXT:    fsd fa1, 8(sp)
-; RV32IFD-NEXT:    lw a0, 12(sp)
-; RV32IFD-NEXT:    fmv.x.w a1, fa0
-; RV32IFD-NEXT:    lui a2, 524288
-; RV32IFD-NEXT:    and a0, a0, a2
+; RV32IFD-NEXT:    lui a1, 524288
+; RV32IFD-NEXT:    lw a2, 12(sp)
+; RV32IFD-NEXT:    and a1, a2, a1
 ; RV32IFD-NEXT:    lui a2, 1048560
-; RV32IFD-NEXT:    slli a1, a1, 17
-; RV32IFD-NEXT:    srli a1, a1, 17
-; RV32IFD-NEXT:    srli a0, a0, 16
-; RV32IFD-NEXT:    or a1, a1, a2
-; RV32IFD-NEXT:    or a0, a1, a0
+; RV32IFD-NEXT:    slli a0, a0, 17
+; RV32IFD-NEXT:    srli a0, a0, 17
+; RV32IFD-NEXT:    srli a1, a1, 16
+; RV32IFD-NEXT:    or a0, a0, a2
+; RV32IFD-NEXT:    or a0, a0, a1
 ; RV32IFD-NEXT:    fmv.w.x fa0, a0
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index da97ac0d74237..a098de49f8410 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -475,10 +475,10 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 ; RV64M:       # %bb.0:
 ; RV64M-NEXT:    beqz a0, .LBB3_2
 ; RV64M-NEXT:  # %bb.1: # %cond.false
+; RV64M-NEXT:    neg a1, a0
+; RV64M-NEXT:    and a0, a0, a1
 ; RV64M-NEXT:    lui a1, %hi(.LCPI3_0)
 ; RV64M-NEXT:    ld a1, %lo(.LCPI3_0)(a1)
-; RV64M-NEXT:    neg a2, a0
-; RV64M-NEXT:    and a0, a0, a2
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srli a0, a0, 58
 ; RV64M-NEXT:    lui a1, %hi(.LCPI3_1)
@@ -889,10 +889,10 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind {
 ;
 ; RV64M-LABEL: test_cttz_i64_zero_undef:
 ; RV64M:       # %bb.0:
+; RV64M-NEXT:    neg a1, a0
+; RV64M-NEXT:    and a0, a0, a1
 ; RV64M-NEXT:    lui a1, %hi(.LCPI7_0)
 ; RV64M-NEXT:    ld a1, %lo(.LCPI7_0)(a1)
-; RV64M-NEXT:    neg a2, a0
-; RV64M-NEXT:    and a0, a0, a2
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srli a0, a0, 58
 ; RV64M-NEXT:    lui a1, %hi(.LCPI7_1)
diff --git a/llvm/test/CodeGen/RISCV/double-calling-conv.ll b/llvm/test/CodeGen/RISCV/double-calling-conv.ll
index 798eac64e9fc2..51f75c10462d0 100644
--- a/llvm/test/CodeGen/RISCV/double-calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/double-calling-conv.ll
@@ -93,8 +93,8 @@ define double @callee_double_split_reg_stack(i32 %a, i64 %b, i64 %c, double %d,
 ; RV32IZFINXZDINX-LABEL: callee_double_split_reg_stack:
 ; RV32IZFINXZDINX:       # %bb.0:
 ; RV32IZFINXZDINX-NEXT:    mv a0, a7
-; RV32IZFINXZDINX-NEXT:    lw a1, 0(sp)
 ; RV32IZFINXZDINX-NEXT:    mv a3, a6
+; RV32IZFINXZDINX-NEXT:    lw a1, 0(sp)
 ; RV32IZFINXZDINX-NEXT:    mv a2, a5
 ; RV32IZFINXZDINX-NEXT:    fadd.d a0, a2, a0
 ; RV32IZFINXZDINX-NEXT:    ret
@@ -115,8 +115,8 @@ define double @caller_double_split_reg_stack() nounwind {
 ; RV32IFD-NEXT:    addi a2, a2, 327
 ; RV32IFD-NEXT:    addi a6, a3, 327
 ; RV32IFD-NEXT:    addi a5, a4, -1311
-; RV32IFD-NEXT:    li a3, 3
 ; RV32IFD-NEXT:    sw a2, 0(sp)
+; RV32IFD-NEXT:    li a3, 3
 ; RV32IFD-NEXT:    li a2, 0
 ; RV32IFD-NEXT:    li a4, 0
 ; RV32IFD-NEXT:    mv a7, a5
@@ -137,8 +137,8 @@ define double @caller_double_split_reg_stack() nounwind {
 ; RV32IZFINXZDINX-NEXT:    addi a2, a2, 327
 ; RV32IZFINXZDINX-NEXT:    addi a6, a3, 327
 ; RV32IZFINXZDINX-NEXT:    addi a5, a4, -1311
-; RV32IZFINXZDINX-NEXT:    li a3, 3
 ; RV32IZFINXZDINX-NEXT:    sw a2, 0(sp)
+; RV32IZFINXZDINX-NEXT:    li a3, 3
 ; RV32IZFINXZDINX-NEXT:    li a2, 0
 ; RV32IZFINXZDINX-NEXT:    li a4, 0
 ; RV32IZFINXZDINX-NEXT:    mv a7, a5
@@ -186,7 +186,6 @@ define double @caller_double_stack() nounwind {
 ; RV32IFD-NEXT:    li a0, 1
 ; RV32IFD-NEXT:    li a2, 2
 ; RV32IFD-NEXT:    li a4, 3
-; RV32IFD-NEXT:    li a6, 4
 ; RV32IFD-NEXT:    addi a1, a1, 327
 ; RV32IFD-NEXT:    addi a3, a3, -1311
 ; RV32IFD-NEXT:    addi a5, a5, 327
@@ -194,6 +193,7 @@ define double @caller_double_stack() nounwind {
 ; RV32IFD-NEXT:    sw a1, 4(sp)
 ; RV32IFD-NEXT:    sw a3, 8(sp)
 ; RV32IFD-NEXT:    sw a5, 12(sp)
+; RV32IFD-NEXT:    li a6, 4
 ; RV32IFD-NEXT:    li a1, 0
 ; RV32IFD-NEXT:    li a3, 0
 ; RV32IFD-NEXT:    li a5, 0
@@ -213,7 +213,6 @@ define double @caller_double_stack() nounwind {
 ; RV32IZFINXZDINX-NEXT:    li a0, 1
 ; RV32IZFINXZDINX-NEXT:    li a2, 2
 ; RV32IZFINXZDINX-NEXT:    li a4, 3
-; RV32IZFINXZDINX-NEXT:    li a6, 4
 ; RV32IZFINXZDINX-NEXT:    addi a1, a1, 327
 ; RV32IZFINXZDINX-NEXT:    addi a3, a3, -1311
 ; RV32IZFINXZDINX-NEXT:    addi a5, a5, 327
@@ -221,6 +220,7 @@ define double @caller_double_stack() nounwind {
 ; RV32IZFINXZDINX-NEXT:    sw a1, 4(sp)
 ; RV32IZFINXZDINX-NEXT:    sw a3, 8(sp)
 ; RV32IZFINXZDINX-NEXT:    sw a5, 12(sp)
+; RV32IZFINXZDINX-NEXT:    li a6, 4
 ; RV32IZFINXZDINX-NEXT:    li a1, 0
 ; RV32IZFINXZDINX-NEXT:    li a3, 0
 ; RV32IZFINXZDINX-NEXT:    li a5, 0
diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll
index c39085a80ddc1..052cfd6adff06 100644
--- a/llvm/test/CodeGen/RISCV/double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert.ll
@@ -687,9 +687,9 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32IFD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    fsd fs0, 0(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI12_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
-; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
 ; RV32IFD-NEXT:    lui a3, 524288
@@ -1624,13 +1624,13 @@ define signext i16 @fcvt_w_s_i16(double %a) nounwind {
 define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind {
 ; RV32IFD-LABEL: fcvt_w_s_sat_i16:
 ; RV32IFD:       # %bb.0: # %start
-; RV32IFD-NEXT:    lui a0, %hi(.LCPI26_0)
-; RV32IFD-NEXT:    fld fa5, %lo(.LCPI26_0)(a0)
-; RV32IFD-NEXT:    lui a0, %hi(.LCPI26_1)
-; RV32IFD-NEXT:    fld fa4, %lo(.LCPI26_1)(a0)
 ; RV32IFD-NEXT:    feq.d a0, fa0, fa0
-; RV32IFD-NEXT:    fmax.d fa5, fa0, fa5
+; RV32IFD-NEXT:    lui a1, %hi(.LCPI26_0)
+; RV32IFD-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
+; RV32IFD-NEXT:    lui a1, %hi(.LCPI26_1)
 ; RV32IFD-NEXT:    neg a0, a0
+; RV32IFD-NEXT:    fmax.d fa5, fa0, fa5
+; RV32IFD-NEXT:    fld fa4, %lo(.LCPI26_1)(a1)
 ; RV32IFD-NEXT:    fmin.d fa5, fa5, fa4
 ; RV32IFD-NEXT:    fcvt.w.d a1, fa5, rtz
 ; RV32IFD-NEXT:    and a0, a0, a1
@@ -1638,13 +1638,13 @@ define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind {
 ;
 ; RV64IFD-LABEL: fcvt_w_s_sat_i16:
 ; RV64IFD:       # %bb.0: # %start
-; RV64IFD-NEXT:    lui a0, %hi(.LCPI26_0)
-; RV64IFD-NEXT:    fld fa5, %lo(.LCPI26_0)(a0)
-; RV64IFD-NEXT:    lui a0, %hi(.LCPI26_1)
-; RV64IFD-NEXT:    fld fa4, %lo(.LCPI26_1)(a0)
 ; RV64IFD-NEXT:    feq.d a0, fa0, fa0
-; RV64IFD-NEXT:    fmax.d fa5, fa0, fa5
+; RV64IFD-NEXT:    lui a1, %hi(.LCPI26_0)
+; RV64IFD-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
+; RV64IFD-NEXT:    lui a1, %hi(.LCPI26_1)
 ; RV64IFD-NEXT:    neg a0, a0
+; RV64IFD-NEXT:    fmax.d fa5, fa0, fa5
+; RV64IFD-NEXT:    fld fa4, %lo(.LCPI26_1)(a1)
 ; RV64IFD-NEXT:    fmin.d fa5, fa5, fa4
 ; RV64IFD-NEXT:    fcvt.l.d a1, fa5, rtz
 ; RV64IFD-NEXT:    and a0, a0, a1
@@ -1653,31 +1653,31 @@ define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind {
 ; RV32IZFINXZDINX-LABEL: fcvt_w_s_sat_i16:
 ; RV32IZFINXZDINX:       # %bb.0: # %start
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI26_0)
-; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI26_0+4)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI26_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI26_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI26_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI26_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fmax.d a2, a0, a2
+; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI26_1)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI26_0)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI26_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    fmax.d a4, a0, a4
 ; RV32IZFINXZDINX-NEXT:    feq.d a0, a0, a0
 ; RV32IZFINXZDINX-NEXT:    neg a0, a0
-; RV32IZFINXZDINX-NEXT:    fmin.d a2, a2, a4
+; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI26_1)(a3)
+; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI26_1+4)(a3)
+; RV32IZFINXZDINX-NEXT:    fmin.d a2, a4, a2
 ; RV32IZFINXZDINX-NEXT:    fcvt.w.d a1, a2, rtz
 ; RV32IZFINXZDINX-NEXT:    and a0, a0, a1
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fcvt_w_s_sat_i16:
 ; RV64IZFINXZDINX:       # %bb.0: # %start
-; RV64IZFINXZDINX-NEXT:    li a1, -505
+; RV64IZFINXZDINX-NEXT:    feq.d a1, a0, a0
+; RV64IZFINXZDINX-NEXT:    li a2, -505
+; RV64IZFINXZDINX-NEXT:    slli a2, a2, 53
+; RV64IZFINXZDINX-NEXT:    fmax.d a0, a0, a2
 ; RV64IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI26_0)
-; RV64IZFINXZDINX-NEXT:    slli a1, a1, 53
+; RV64IZFINXZDINX-NEXT:    neg a1, a1
 ; RV64IZFINXZDINX-NEXT:    ld a2, %lo(.LCPI26_0)(a2)
-; RV64IZFINXZDINX-NEXT:    fmax.d a1, a0, a1
-; RV64IZFINXZDINX-NEXT:    feq.d a0, a0, a0
-; RV64IZFINXZDINX-NEXT:    neg a0, a0
-; RV64IZFINXZDINX-NEXT:    fmin.d a1, a1, a2
-; RV64IZFINXZDINX-NEXT:    fcvt.l.d a1, a1, rtz
-; RV64IZFINXZDINX-NEXT:    and a0, a0, a1
+; RV64IZFINXZDINX-NEXT:    fmin.d a0, a0, a2
+; RV64IZFINXZDINX-NEXT:    fcvt.l.d a0, a0, rtz
+; RV64IZFINXZDINX-NEXT:    and a0, a1, a0
 ; RV64IZFINXZDINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcvt_w_s_sat_i16:
@@ -1829,40 +1829,40 @@ define zeroext i16 @fcvt_wu_s_i16(double %a) nounwind {
 define zeroext i16 @fcvt_wu_s_sat_i16(double %a) nounwind {
 ; RV32IFD-LABEL: fcvt_wu_s_sat_i16:
 ; RV32IFD:       # %bb.0: # %start
+; RV32IFD-NEXT:    fcvt.d.w fa5, zero
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI28_0)
-; RV32IFD-NEXT:    fld fa5, %lo(.LCPI28_0)(a0)
-; RV32IFD-NEXT:    fcvt.d.w fa4, zero
-; RV32IFD-NEXT:    fmax.d fa4, fa0, fa4
-; RV32IFD-NEXT:    fmin.d fa5, fa4, fa5
+; RV32IFD-NEXT:    fmax.d fa5, fa0, fa5
+; RV32IFD-NEXT:    fld fa4, %lo(.LCPI28_0)(a0)
+; RV32IFD-NEXT:    fmin.d fa5, fa5, fa4
 ; RV32IFD-NEXT:    fcvt.wu.d a0, fa5, rtz
 ; RV32IFD-NEXT:    ret
 ;
 ; RV64IFD-LABEL: fcvt_wu_s_sat_i16:
 ; RV64IFD:       # %bb.0: # %start
+; RV64IFD-NEXT:    fmv.d.x fa5, zero
 ; RV64IFD-NEXT:    lui a0, %hi(.LCPI28_0)
-; RV64IFD-NEXT:    fld fa5, %lo(.LCPI28_0)(a0)
-; RV64IFD-NEXT:    fmv.d.x fa4, zero
-; RV64IFD-NEXT:    fmax.d fa4, fa0, fa4
-; RV64IFD-NEXT:    fmin.d fa5, fa4, fa5
+; RV64IFD-NEXT:    fmax.d fa5, fa0, fa5
+; RV64IFD-NEXT:    fld fa4, %lo(.LCPI28_0)(a0)
+; RV64IFD-NEXT:    fmin.d fa5, fa5, fa4
 ; RV64IFD-NEXT:    fcvt.lu.d a0, fa5, rtz
 ; RV64IFD-NEXT:    ret
 ;
 ; RV32IZFINXZDINX-LABEL: fcvt_wu_s_sat_i16:
 ; RV32IZFINXZDINX:       # %bb.0: # %start
+; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
+; RV32IZFINXZDINX-NEXT:    fmax.d a0, a0, a2
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI28_0)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI28_0+4)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI28_0)(a2)
-; RV32IZFINXZDINX-NEXT:    fcvt.d.w a4, zero
-; RV32IZFINXZDINX-NEXT:    fmax.d a0, a0, a4
 ; RV32IZFINXZDINX-NEXT:    fmin.d a0, a0, a2
 ; RV32IZFINXZDINX-NEXT:    fcvt.wu.d a0, a0, rtz
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fcvt_wu_s_sat_i16:
 ; RV64IZFINXZDINX:       # %bb.0: # %start
+; RV64IZFINXZDINX-NEXT:    fmax.d a0, a0, zero
 ; RV64IZFINXZDINX-NEXT:    lui a1, %hi(.LCPI28_0)
 ; RV64IZFINXZDINX-NEXT:    ld a1, %lo(.LCPI28_0)(a1)
-; RV64IZFINXZDINX-NEXT:    fmax.d a0, a0, zero
 ; RV64IZFINXZDINX-NEXT:    fmin.d a0, a0, a1
 ; RV64IZFINXZDINX-NEXT:    fcvt.lu.d a0, a0, rtz
 ; RV64IZFINXZDINX-NEXT:    ret
@@ -1999,13 +1999,13 @@ define signext i8 @fcvt_w_s_i8(double %a) nounwind {
 define signext i8 @fcvt_w_s_sat_i8(double %a) nounwind {
 ; RV32IFD-LABEL: fcvt_w_s_sat_i8:
 ; RV32IFD:       # %bb.0: # %start
-; RV32IFD-NEXT:    lui a0, %hi(.LCPI30_0)
-; RV32IFD-NEXT:    fld fa5, %lo(.LCPI30_0)(a0)
-; RV32IFD-NEXT:    lui a0, %hi(.LCPI30_1)
-; RV32IFD-NEXT:    fld fa4, %lo(.LCPI30_1)(a0)
 ; RV32IFD-NEXT:    feq.d a0, fa0, fa0
-; RV32IFD-NEXT:    fmax.d fa5, fa0, fa5
+; RV32IFD-NEXT:    lui a1, %hi(.LCPI30_0)
+; RV32IFD-NEXT:    fld fa5, %lo(.LCPI30_0)(a1)
+; RV32IFD-NEXT:    lui a1, %hi(.LCPI30_1)
 ; RV32IFD-NEXT:    neg a0, a0
+; RV32IFD-NEXT:    fmax.d fa5, fa0, fa5
+; RV32IFD-NEXT:    fld fa4, %lo(.LCPI30_1)(a1)
 ; RV32IFD-NEXT:    fmin.d fa5, fa5, fa4
 ; RV32IFD-NEXT:    fcvt.w.d a1, fa5, rtz
 ; RV32IFD-NEXT:    and a0, a0, a1
@@ -2013,13 +2013,13 @@ define signext i8 @fcvt_w_s_sat_i8(double %a) nounwind {
 ;
 ; RV64IFD-LABEL: fcvt_w_s_sat_i8:
 ; RV64IFD:       # %bb.0: # %start
-; RV64IFD-NEXT:    lui a0, %hi(.LCPI30_0)
-; RV64IFD-NEXT:    fld fa5, %lo(.LCPI30_0)(a0)
-; RV64IFD-NEXT:    lui a0, %hi(.LCPI30_1)
-; RV64IFD-NEXT:    fld fa4, %lo(.LCPI30_1)(a0)
 ; RV64IFD-NEXT:    feq.d a0, fa0, fa0
-; RV64IFD-NEXT:    fmax.d fa5, fa0, fa5
+; RV64IFD-NEXT:    lui a1, %hi(.LCPI30_0)
+; RV64IFD-NEXT:    fld fa5, %lo(.LCPI30_0)(a1)
+; RV64IFD-NEXT:    lui a1, %hi(.LCPI30_1)
 ; RV64IFD-NEXT:    neg a0, a0
+; RV64IFD-NEXT:    fmax.d fa5, fa0, fa5
+; RV64IFD-NEXT:    fld fa4, %lo(.LCPI30_1)(a1)
 ; RV64IFD-NEXT:    fmin.d fa5, fa5, fa4
 ; RV64IFD-NEXT:    fcvt.l.d a1, fa5, rtz
 ; RV64IFD-NEXT:    and a0, a0, a1
@@ -2028,15 +2028,15 @@ define signext i8 @fcvt_w_s_sat_i8(double %a) nounwind {
 ; RV32IZFINXZDINX-LABEL: fcvt_w_s_sat_i8:
 ; RV32IZFINXZDINX:       # %bb.0: # %start
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI30_0)
-; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI30_0+4)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI30_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI30_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI30_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI30_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fmax.d a2, a0, a2
+; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI30_1)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI30_0)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI30_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    fmax.d a4, a0, a4
 ; RV32IZFINXZDINX-NEXT:    feq.d a0, a0, a0
 ; RV32IZFINXZDINX-NEXT:    neg a0, a0
-; RV32IZFINXZDINX-NEXT:    fmin.d a2, a2, a4
+; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI30_1)(a3)
+; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI30_1+4)(a3)
+; RV32IZFINXZDINX-NEXT:    fmin.d a2, a4, a2
 ; RV32IZFINXZDINX-NEXT:    fcvt.w.d a1, a2, rtz
 ; RV32IZFINXZDINX-NEXT:    and a0, a0, a1
 ; RV32IZFINXZDINX-NEXT:    ret
@@ -2203,31 +2203,31 @@ define zeroext i8 @fcvt_wu_s_sat_i8(double %a) nounwind {
 ;
 ; RV32IFD-LABEL: fcvt_wu_s_sat_i8:
 ; RV32IFD:       # %bb.0: # %start
+; RV32IFD-NEXT:    fcvt.d.w fa5, zero
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI32_0)
-; RV32IFD-NEXT:    fld fa5, %lo(.LCPI32_0)(a0)
-; RV32IFD-NEXT:    fcvt.d.w fa4, zero
-; RV32IFD-NEXT:    fmax.d fa4, fa0, fa4
-; RV32IFD-NEXT:    fmin.d fa5, fa4, fa5
+; RV32IFD-NEXT:    fmax.d fa5, fa0, fa5
+; RV32IFD-NEXT:    fld fa4, %lo(.LCPI32_0)(a0)
+; RV32IFD-NEXT:    fmin.d fa5, fa5, fa4
 ; RV32IFD-NEXT:    fcvt.wu.d a0, fa5, rtz
 ; RV32IFD-NEXT:    ret
 ;
 ; RV64IFD-LABEL: fcvt_wu_s_sat_i8:
 ; RV64IFD:       # %bb.0: # %start
+; RV64IFD-NEXT:    fmv.d.x fa5, zero
 ; RV64IFD-NEXT:    lui a0, %hi(.LCPI32_0)
-; RV64IFD-NEXT:    fld fa5, %lo(.LCPI32_0)(a0)
-; RV64IFD-NEXT:    fmv.d.x fa4, zero
-; RV64IFD-NEXT:    fmax.d fa4, fa0, fa4
-; RV64IFD-NEXT:    fmin.d fa5, fa4, fa5
+; RV64IFD-NEXT:    fmax.d fa5, fa0, fa5
+; RV64IFD-NEXT:    fld fa4, %lo(.LCPI32_0)(a0)
+; RV64IFD-NEXT:    fmin.d fa5, fa5, fa4
 ; RV64IFD-NEXT:    fcvt.lu.d a0, fa5, rtz
 ; RV64IFD-NEXT:    ret
 ;
 ; RV32IZFINXZDINX-LABEL: fcvt_wu_s_sat_i8:
 ; RV32IZFINXZDINX:       # %bb.0: # %start
+; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
+; RV32IZFINXZDINX-NEXT:    fmax.d a0, a0, a2
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI32_0)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI32_0+4)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI32_0)(a2)
-; RV32IZFINXZDINX-NEXT:    fcvt.d.w a4, zero
-; RV32IZFINXZDINX-NEXT:    fmax.d a0, a0, a4
 ; RV32IZFINXZDINX-NEXT:    fmin.d a0, a0, a2
 ; RV32IZFINXZDINX-NEXT:    fcvt.wu.d a0, a0, rtz
 ; RV32IZFINXZDINX-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll
index 949668f640dbd..30f995207851f 100644
--- a/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll
+++ b/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll
@@ -275,8 +275,8 @@ define i32 @fcmp_one(double %a, double %b) nounwind strictfp {
 ; CHECKIFD-NEXT:    frflags a0
 ; CHECKIFD-NEXT:    flt.d a2, fa1, fa0
 ; CHECKIFD-NEXT:    fsflags a0
-; CHECKIFD-NEXT:    or a0, a2, a1
 ; CHECKIFD-NEXT:    feq.d zero, fa1, fa0
+; CHECKIFD-NEXT:    or a0, a2, a1
 ; CHECKIFD-NEXT:    ret
 ;
 ; RV32IZFINXZDINX-LABEL: fcmp_one:
@@ -288,9 +288,8 @@ define i32 @fcmp_one(double %a, double %b) nounwind strictfp {
 ; RV32IZFINXZDINX-NEXT:    frflags a4
 ; RV32IZFINXZDINX-NEXT:    flt.d a6, a2, a0
 ; RV32IZFINXZDINX-NEXT:    fsflags a4
-; RV32IZFINXZDINX-NEXT:    or a4, a6, a5
 ; RV32IZFINXZDINX-NEXT:    feq.d zero, a2, a0
-; RV32IZFINXZDINX-NEXT:    mv a0, a4
+; RV32IZFINXZDINX-NEXT:    or a0, a6, a5
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fcmp_one:
@@ -302,9 +301,8 @@ define i32 @fcmp_one(double %a, double %b) nounwind strictfp {
 ; RV64IZFINXZDINX-NEXT:    frflags a2
 ; RV64IZFINXZDINX-NEXT:    flt.d a4, a1, a0
 ; RV64IZFINXZDINX-NEXT:    fsflags a2
-; RV64IZFINXZDINX-NEXT:    or a2, a4, a3
 ; RV64IZFINXZDINX-NEXT:    feq.d zero, a1, a0
-; RV64IZFINXZDINX-NEXT:    mv a0, a2
+; RV64IZFINXZDINX-NEXT:    or a0, a4, a3
 ; RV64IZFINXZDINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_one:
@@ -423,9 +421,9 @@ define i32 @fcmp_ueq(double %a, double %b) nounwind strictfp {
 ; CHECKIFD-NEXT:    frflags a0
 ; CHECKIFD-NEXT:    flt.d a2, fa1, fa0
 ; CHECKIFD-NEXT:    fsflags a0
+; CHECKIFD-NEXT:    feq.d zero, fa1, fa0
 ; CHECKIFD-NEXT:    or a1, a2, a1
 ; CHECKIFD-NEXT:    xori a0, a1, 1
-; CHECKIFD-NEXT:    feq.d zero, fa1, fa0
 ; CHECKIFD-NEXT:    ret
 ;
 ; RV32IZFINXZDINX-LABEL: fcmp_ueq:
@@ -437,10 +435,9 @@ define i32 @fcmp_ueq(double %a, double %b) nounwind strictfp {
 ; RV32IZFINXZDINX-NEXT:    frflags a4
 ; RV32IZFINXZDINX-NEXT:    flt.d a6, a2, a0
 ; RV32IZFINXZDINX-NEXT:    fsflags a4
-; RV32IZFINXZDINX-NEXT:    or a4, a6, a5
-; RV32IZFINXZDINX-NEXT:    xori a4, a4, 1
 ; RV32IZFINXZDINX-NEXT:    feq.d zero, a2, a0
-; RV32IZFINXZDINX-NEXT:    mv a0, a4
+; RV32IZFINXZDINX-NEXT:    or a0, a6, a5
+; RV32IZFINXZDINX-NEXT:    xori a0, a0, 1
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fcmp_ueq:
@@ -452,10 +449,9 @@ define i32 @fcmp_ueq(double %a, double %b) nounwind strictfp {
 ; RV64IZFINXZDINX-NEXT:    frflags a2
 ; RV64IZFINXZDINX-NEXT:    flt.d a4, a1, a0
 ; RV64IZFINXZDINX-NEXT:    fsflags a2
-; RV64IZFINXZDINX-NEXT:    or a3, a4, a3
-; RV64IZFINXZDINX-NEXT:    xori a2, a3, 1
 ; RV64IZFINXZDINX-NEXT:    feq.d zero, a1, a0
-; RV64IZFINXZDINX-NEXT:    mv a0, a2
+; RV64IZFINXZDINX-NEXT:    or a3, a4, a3
+; RV64IZFINXZDINX-NEXT:    xori a0, a3, 1
 ; RV64IZFINXZDINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_ueq:
@@ -522,8 +518,8 @@ define i32 @fcmp_ugt(double %a, double %b) nounwind strictfp {
 ; CHECKIFD-NEXT:    frflags a0
 ; CHECKIFD-NEXT:    fle.d a1, fa0, fa1
 ; CHECKIFD-NEXT:    fsflags a0
-; CHECKIFD-NEXT:    xori a0, a1, 1
 ; CHECKIFD-NEXT:    feq.d zero, fa0, fa1
+; CHECKIFD-NEXT:    xori a0, a1, 1
 ; CHECKIFD-NEXT:    ret
 ;
 ; RV32IZFINXZDINX-LABEL: fcmp_ugt:
@@ -531,9 +527,8 @@ define i32 @fcmp_ugt(double %a, double %b) nounwind strictfp {
 ; RV32IZFINXZDINX-NEXT:    frflags a4
 ; RV32IZFINXZDINX-NEXT:    fle.d a5, a0, a2
 ; RV32IZFINXZDINX-NEXT:    fsflags a4
-; RV32IZFINXZDINX-NEXT:    xori a4, a5, 1
 ; RV32IZFINXZDINX-NEXT:    feq.d zero, a0, a2
-; RV32IZFINXZDINX-NEXT:    mv a0, a4
+; RV32IZFINXZDINX-NEXT:    xori a0, a5, 1
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fcmp_ugt:
@@ -541,9 +536,8 @@ define i32 @fcmp_ugt(double %a, double %b) nounwind strictfp {
 ; RV64IZFINXZDINX-NEXT:    frflags a2
 ; RV64IZFINXZDINX-NEXT:    fle.d a3, a0, a1
 ; RV64IZFINXZDINX-NEXT:    fsflags a2
-; RV64IZFINXZDINX-NEXT:    xori a2, a3, 1
 ; RV64IZFINXZDINX-NEXT:    feq.d zero, a0, a1
-; RV64IZFINXZDINX-NEXT:    mv a0, a2
+; RV64IZFINXZDINX-NEXT:    xori a0, a3, 1
 ; RV64IZFINXZDINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_ugt:
@@ -576,8 +570,8 @@ define i32 @fcmp_uge(double %a, double %b) nounwind strictfp {
 ; CHECKIFD-NEXT:    frflags a0
 ; CHECKIFD-NEXT:    flt.d a1, fa0, fa1
 ; CHECKIFD-NEXT:    fsflags a0
-; CHECKIFD-NEXT:    xori a0, a1, 1
 ; CHECKIFD-NEXT:    feq.d zero, fa0, fa1
+; CHECKIFD-NEXT:    xori a0, a1, 1
 ; CHECKIFD-NEXT:    ret
 ;
 ; RV32IZFINXZDINX-LABEL: fcmp_uge:
@@ -585,9 +579,8 @@ define i32 @fcmp_uge(double %a, double %b) nounwind strictfp {
 ; RV32IZFINXZDINX-NEXT:    frflags a4
 ; RV32IZFINXZDINX-NEXT:    flt.d a5, a0, a2
 ; RV32IZFINXZDINX-NEXT:    fsflags a4
-; RV32IZFINXZDINX-NEXT:    xori a4, a5, 1
 ; RV32IZFINXZDINX-NEXT:    feq.d zero, a0, a2
-; RV32IZFINXZDINX-NEXT:    mv a0, a4
+; RV32IZFINXZDINX-NEXT:    xori a0, a5, 1
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fcmp_uge:
@@ -595,9 +588,8 @@ define i32 @fcmp_uge(double %a, double %b) nounwind strictfp {
 ; RV64IZFINXZDINX-NEXT:    frflags a2
 ; RV64IZFINXZDINX-NEXT:    flt.d a3, a0, a1
 ; RV64IZFINXZDINX-NEXT:    fsflags a2
-; RV64IZFINXZDINX-NEXT:    xori a2, a3, 1
 ; RV64IZFINXZDINX-NEXT:    feq.d zero, a0, a1
-; RV64IZFINXZDINX-NEXT:    mv a0, a2
+; RV64IZFINXZDINX-NEXT:    xori a0, a3, 1
 ; RV64IZFINXZDINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_uge:
@@ -632,8 +624,8 @@ define i32 @fcmp_ult(double %a, double %b) nounwind strictfp {
 ; CHECKIFD-NEXT:    frflags a0
 ; CHECKIFD-NEXT:    fle.d a1, fa1, fa0
 ; CHECKIFD-NEXT:    fsflags a0
-; CHECKIFD-NEXT:    xori a0, a1, 1
 ; CHECKIFD-NEXT:    feq.d zero, fa1, fa0
+; CHECKIFD-NEXT:    xori a0, a1, 1
 ; CHECKIFD-NEXT:    ret
 ;
 ; RV32IZFINXZDINX-LABEL: fcmp_ult:
@@ -641,9 +633,8 @@ define i32 @fcmp_ult(double %a, double %b) nounwind strictfp {
 ; RV32IZFINXZDINX-NEXT:    frflags a4
 ; RV32IZFINXZDINX-NEXT:    fle.d a5, a2, a0
 ; RV32IZFINXZDINX-NEXT:    fsflags a4
-; RV32IZFINXZDINX-NEXT:    xori a4, a5, 1
 ; RV32IZFINXZDINX-NEXT:    feq.d zero, a2, a0
-; RV32IZFINXZDINX-NEXT:    mv a0, a4
+; RV32IZFINXZDINX-NEXT:    xori a0, a5, 1
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fcmp_ult:
@@ -651,9 +642,8 @@ define i32 @fcmp_ult(double %a, double %b) nounwind strictfp {
 ; RV64IZFINXZDINX-NEXT:    frflags a2
 ; RV64IZFINXZDINX-NEXT:    fle.d a3, a1, a0
 ; RV64IZFINXZDINX-NEXT:    fsflags a2
-; RV64IZFINXZDINX-NEXT:    xori a2, a3, 1
 ; RV64IZFINXZDINX-NEXT:    feq.d zero, a1, a0
-; RV64IZFINXZDINX-NEXT:    mv a0, a2
+; RV64IZFINXZDINX-NEXT:    xori a0, a3, 1
 ; RV64IZFINXZDINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_ult:
@@ -686,8 +676,8 @@ define i32 @fcmp_ule(double %a, double %b) nounwind strictfp {
 ; CHECKIFD-NEXT:    frflags a0
 ; CHECKIFD-NEXT:    flt.d a1, fa1, fa0
 ; CHECKIFD-NEXT:    fsflags a0
-; CHECKIFD-NEXT:    xori a0, a1, 1
 ; CHECKIFD-NEXT:    feq.d zero, fa1, fa0
+; CHECKIFD-NEXT:    xori a0, a1, 1
 ; CHECKIFD-NEXT:    ret
 ;
 ; RV32IZFINXZDINX-LABEL: fcmp_ule:
@@ -695,9 +685,8 @@ define i32 @fcmp_ule(double %a, double %b) nounwind strictfp {
 ; RV32IZFINXZDINX-NEXT:    frflags a4
 ; RV32IZFINXZDINX-NEXT:    flt.d a5, a2, a0
 ; RV32IZFINXZDINX-NEXT:    fsflags a4
-; RV32IZFINXZDINX-NEXT:    xori a4, a5, 1
 ; RV32IZFINXZDINX-NEXT:    feq.d zero, a2, a0
-; RV32IZFINXZDINX-NEXT:    mv a0, a4
+; RV32IZFINXZDINX-NEXT:    xori a0, a5, 1
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fcmp_ule:
@@ -705,9 +694,8 @@ define i32 @fcmp_ule(double %a, double %b) nounwind strictfp {
 ; RV64IZFINXZDINX-NEXT:    frflags a2
 ; RV64IZFINXZDINX-NEXT:    flt.d a3, a1, a0
 ; RV64IZFINXZDINX-NEXT:    fsflags a2
-; RV64IZFINXZDINX-NEXT:    xori a2, a3, 1
 ; RV64IZFINXZDINX-NEXT:    feq.d zero, a1, a0
-; RV64IZFINXZDINX-NEXT:    mv a0, a2
+; RV64IZFINXZDINX-NEXT:    xori a0, a3, 1
 ; RV64IZFINXZDINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_ule:
diff --git a/llvm/test/CodeGen/RISCV/double-imm.ll b/llvm/test/CodeGen/RISCV/double-imm.ll
index 155827ad069cc..97fc1bde6155f 100644
--- a/llvm/test/CodeGen/RISCV/double-imm.ll
+++ b/llvm/test/CodeGen/RISCV/double-imm.ll
@@ -158,12 +158,12 @@ define dso_local double @negzero_sel(i16 noundef %a, double noundef %d) nounwind
 ;
 ; CHECKRV64ZDINX-LABEL: negzero_sel:
 ; CHECKRV64ZDINX:       # %bb.0: # %entry
-; CHECKRV64ZDINX-NEXT:    slli a2, a0, 48
-; CHECKRV64ZDINX-NEXT:    mv a0, a1
-; CHECKRV64ZDINX-NEXT:    beqz a2, .LBB4_2
+; CHECKRV64ZDINX-NEXT:    slli a0, a0, 48
+; CHECKRV64ZDINX-NEXT:    beqz a0, .LBB4_2
 ; CHECKRV64ZDINX-NEXT:  # %bb.1: # %entry
-; CHECKRV64ZDINX-NEXT:    fneg.d a0, zero
+; CHECKRV64ZDINX-NEXT:    fneg.d a1, zero
 ; CHECKRV64ZDINX-NEXT:  .LBB4_2: # %entry
+; CHECKRV64ZDINX-NEXT:    mv a0, a1
 ; CHECKRV64ZDINX-NEXT:    ret
 entry:
   %tobool.not = icmp eq i16 %a, 0
diff --git a/llvm/test/CodeGen/RISCV/double-mem.ll b/llvm/test/CodeGen/RISCV/double-mem.ll
index dba9489e7511d..134c8cb0689ca 100644
--- a/llvm/test/CodeGen/RISCV/double-mem.ll
+++ b/llvm/test/CodeGen/RISCV/double-mem.ll
@@ -51,10 +51,10 @@ define dso_local void @fsd(ptr %a, double %b, double %c) nounwind {
 ; RV32IZFINXZDINX-LABEL: fsd:
 ; RV32IZFINXZDINX:       # %bb.0:
 ; RV32IZFINXZDINX-NEXT:    mv a5, a4
-; RV32IZFINXZDINX-NEXT:    mv a7, a2
 ; RV32IZFINXZDINX-NEXT:    mv a4, a3
-; RV32IZFINXZDINX-NEXT:    mv a6, a1
-; RV32IZFINXZDINX-NEXT:    fadd.d a2, a6, a4
+; RV32IZFINXZDINX-NEXT:    mv a3, a2
+; RV32IZFINXZDINX-NEXT:    mv a2, a1
+; RV32IZFINXZDINX-NEXT:    fadd.d a2, a2, a4
 ; RV32IZFINXZDINX-NEXT:    sw a2, 0(a0)
 ; RV32IZFINXZDINX-NEXT:    sw a3, 4(a0)
 ; RV32IZFINXZDINX-NEXT:    sw a2, 64(a0)
diff --git a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
index cd87f2d2301d7..8ebeeabec4a09 100644
--- a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
@@ -48,9 +48,9 @@ define i64 @test_floor_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    fsd fs0, 0(sp) # 8-byte Folded Spill
 ; RV32IFD-NEXT:    call floor
+; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI1_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI1_0)(a0)
-; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
 ; RV32IFD-NEXT:    lui a3, 524288
@@ -103,9 +103,9 @@ define i64 @test_floor_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI1_1)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI1_0)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI1_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI1_1)(a3)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI1_1+4)(a3)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
 ; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
 ; RV32IZFINXZDINX-NEXT:    lui a4, 524288
@@ -185,12 +185,12 @@ define i64 @test_floor_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    call floor
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI3_0)
+; RV32IFD-NEXT:    fcvt.d.w fa5, zero
+; RV32IFD-NEXT:    fle.d a1, fa5, fa0
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI3_0)(a0)
-; RV32IFD-NEXT:    fcvt.d.w fa4, zero
-; RV32IFD-NEXT:    fle.d a0, fa4, fa0
-; RV32IFD-NEXT:    flt.d a1, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a1
-; RV32IFD-NEXT:    neg s1, a0
+; RV32IFD-NEXT:    flt.d a0, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a0
+; RV32IFD-NEXT:    neg s1, a1
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
@@ -292,9 +292,9 @@ define i64 @test_ceil_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    fsd fs0, 0(sp) # 8-byte Folded Spill
 ; RV32IFD-NEXT:    call ceil
+; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI5_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI5_0)(a0)
-; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
 ; RV32IFD-NEXT:    lui a3, 524288
@@ -347,9 +347,9 @@ define i64 @test_ceil_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI5_1)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI5_0)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI5_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI5_1)(a3)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI5_1+4)(a3)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
 ; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
 ; RV32IZFINXZDINX-NEXT:    lui a4, 524288
@@ -429,12 +429,12 @@ define i64 @test_ceil_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    call ceil
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI7_0)
+; RV32IFD-NEXT:    fcvt.d.w fa5, zero
+; RV32IFD-NEXT:    fle.d a1, fa5, fa0
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI7_0)(a0)
-; RV32IFD-NEXT:    fcvt.d.w fa4, zero
-; RV32IFD-NEXT:    fle.d a0, fa4, fa0
-; RV32IFD-NEXT:    flt.d a1, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a1
-; RV32IFD-NEXT:    neg s1, a0
+; RV32IFD-NEXT:    flt.d a0, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a0
+; RV32IFD-NEXT:    neg s1, a1
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
@@ -536,9 +536,9 @@ define i64 @test_trunc_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    fsd fs0, 0(sp) # 8-byte Folded Spill
 ; RV32IFD-NEXT:    call trunc
+; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI9_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI9_0)(a0)
-; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
 ; RV32IFD-NEXT:    lui a3, 524288
@@ -591,9 +591,9 @@ define i64 @test_trunc_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI9_1)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI9_0)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI9_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI9_1)(a3)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI9_1+4)(a3)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
 ; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
 ; RV32IZFINXZDINX-NEXT:    lui a4, 524288
@@ -673,12 +673,12 @@ define i64 @test_trunc_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    call trunc
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI11_0)
+; RV32IFD-NEXT:    fcvt.d.w fa5, zero
+; RV32IFD-NEXT:    fle.d a1, fa5, fa0
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
-; RV32IFD-NEXT:    fcvt.d.w fa4, zero
-; RV32IFD-NEXT:    fle.d a0, fa4, fa0
-; RV32IFD-NEXT:    flt.d a1, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a1
-; RV32IFD-NEXT:    neg s1, a0
+; RV32IFD-NEXT:    flt.d a0, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a0
+; RV32IFD-NEXT:    neg s1, a1
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
@@ -780,9 +780,9 @@ define i64 @test_round_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    fsd fs0, 0(sp) # 8-byte Folded Spill
 ; RV32IFD-NEXT:    call round
+; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI13_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
-; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
 ; RV32IFD-NEXT:    lui a3, 524288
@@ -835,9 +835,9 @@ define i64 @test_round_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI13_1)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI13_0)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI13_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI13_1)(a3)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI13_1+4)(a3)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
 ; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
 ; RV32IZFINXZDINX-NEXT:    lui a4, 524288
@@ -917,12 +917,12 @@ define i64 @test_round_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    call round
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI15_0)
+; RV32IFD-NEXT:    fcvt.d.w fa5, zero
+; RV32IFD-NEXT:    fle.d a1, fa5, fa0
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI15_0)(a0)
-; RV32IFD-NEXT:    fcvt.d.w fa4, zero
-; RV32IFD-NEXT:    fle.d a0, fa4, fa0
-; RV32IFD-NEXT:    flt.d a1, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a1
-; RV32IFD-NEXT:    neg s1, a0
+; RV32IFD-NEXT:    flt.d a0, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a0
+; RV32IFD-NEXT:    neg s1, a1
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
@@ -1024,9 +1024,9 @@ define i64 @test_roundeven_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    fsd fs0, 0(sp) # 8-byte Folded Spill
 ; RV32IFD-NEXT:    call roundeven
+; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI17_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
-; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
 ; RV32IFD-NEXT:    lui a3, 524288
@@ -1079,9 +1079,9 @@ define i64 @test_roundeven_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI17_1)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI17_0)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI17_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI17_1)(a3)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI17_1+4)(a3)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
 ; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
 ; RV32IZFINXZDINX-NEXT:    lui a4, 524288
@@ -1161,12 +1161,12 @@ define i64 @test_roundeven_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    call roundeven
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI19_0)
+; RV32IFD-NEXT:    fcvt.d.w fa5, zero
+; RV32IFD-NEXT:    fle.d a1, fa5, fa0
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
-; RV32IFD-NEXT:    fcvt.d.w fa4, zero
-; RV32IFD-NEXT:    fle.d a0, fa4, fa0
-; RV32IFD-NEXT:    flt.d a1, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a1
-; RV32IFD-NEXT:    neg s1, a0
+; RV32IFD-NEXT:    flt.d a0, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a0
+; RV32IFD-NEXT:    neg s1, a1
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
@@ -1268,9 +1268,9 @@ define i64 @test_rint_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    fsd fs0, 0(sp) # 8-byte Folded Spill
 ; RV32IFD-NEXT:    call rint
+; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI21_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI21_0)(a0)
-; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
 ; RV32IFD-NEXT:    lui a3, 524288
@@ -1323,9 +1323,9 @@ define i64 @test_rint_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI21_1)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI21_0)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI21_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI21_1)(a3)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI21_1+4)(a3)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
 ; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
 ; RV32IZFINXZDINX-NEXT:    lui a4, 524288
@@ -1405,12 +1405,12 @@ define i64 @test_rint_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    call rint
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI23_0)
+; RV32IFD-NEXT:    fcvt.d.w fa5, zero
+; RV32IFD-NEXT:    fle.d a1, fa5, fa0
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI23_0)(a0)
-; RV32IFD-NEXT:    fcvt.d.w fa4, zero
-; RV32IFD-NEXT:    fle.d a0, fa4, fa0
-; RV32IFD-NEXT:    flt.d a1, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a1
-; RV32IFD-NEXT:    neg s1, a0
+; RV32IFD-NEXT:    flt.d a0, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a0
+; RV32IFD-NEXT:    neg s1, a1
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
diff --git a/llvm/test/CodeGen/RISCV/double-select-fcmp.ll b/llvm/test/CodeGen/RISCV/double-select-fcmp.ll
index e7ff991413013..10c417174e7fd 100644
--- a/llvm/test/CodeGen/RISCV/double-select-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/double-select-fcmp.ll
@@ -545,22 +545,22 @@ define i32 @i32_select_fcmp_oeq(double %a, double %b, i32 %c, i32 %d) nounwind {
 ;
 ; CHECKRV32ZDINX-LABEL: i32_select_fcmp_oeq:
 ; CHECKRV32ZDINX:       # %bb.0:
-; CHECKRV32ZDINX-NEXT:    feq.d a1, a0, a2
-; CHECKRV32ZDINX-NEXT:    mv a0, a4
-; CHECKRV32ZDINX-NEXT:    bnez a1, .LBB16_2
+; CHECKRV32ZDINX-NEXT:    feq.d a0, a0, a2
+; CHECKRV32ZDINX-NEXT:    bnez a0, .LBB16_2
 ; CHECKRV32ZDINX-NEXT:  # %bb.1:
-; CHECKRV32ZDINX-NEXT:    mv a0, a5
+; CHECKRV32ZDINX-NEXT:    mv a4, a5
 ; CHECKRV32ZDINX-NEXT:  .LBB16_2:
+; CHECKRV32ZDINX-NEXT:    mv a0, a4
 ; CHECKRV32ZDINX-NEXT:    ret
 ;
 ; CHECKRV64ZDINX-LABEL: i32_select_fcmp_oeq:
 ; CHECKRV64ZDINX:       # %bb.0:
-; CHECKRV64ZDINX-NEXT:    feq.d a1, a0, a1
-; CHECKRV64ZDINX-NEXT:    mv a0, a2
-; CHECKRV64ZDINX-NEXT:    bnez a1, .LBB16_2
+; CHECKRV64ZDINX-NEXT:    feq.d a0, a0, a1
+; CHECKRV64ZDINX-NEXT:    bnez a0, .LBB16_2
 ; CHECKRV64ZDINX-NEXT:  # %bb.1:
-; CHECKRV64ZDINX-NEXT:    mv a0, a3
+; CHECKRV64ZDINX-NEXT:    mv a2, a3
 ; CHECKRV64ZDINX-NEXT:  .LBB16_2:
+; CHECKRV64ZDINX-NEXT:    mv a0, a2
 ; CHECKRV64ZDINX-NEXT:    ret
   %1 = fcmp oeq double %a, %b
   %2 = select i1 %1, i32 %c, i32 %d
@@ -577,9 +577,9 @@ define i32 @select_fcmp_oeq_1_2(double %a, double %b) {
 ;
 ; CHECKRV32ZDINX-LABEL: select_fcmp_oeq_1_2:
 ; CHECKRV32ZDINX:       # %bb.0:
-; CHECKRV32ZDINX-NEXT:    li a4, 2
 ; CHECKRV32ZDINX-NEXT:    feq.d a0, a0, a2
-; CHECKRV32ZDINX-NEXT:    sub a0, a4, a0
+; CHECKRV32ZDINX-NEXT:    li a1, 2
+; CHECKRV32ZDINX-NEXT:    sub a0, a1, a0
 ; CHECKRV32ZDINX-NEXT:    ret
 ;
 ; CHECKRV64ZDINX-LABEL: select_fcmp_oeq_1_2:
diff --git a/llvm/test/CodeGen/RISCV/double-stack-spill-restore.ll b/llvm/test/CodeGen/RISCV/double-stack-spill-restore.ll
index 4ae912a34d337..4478e7b8c1724 100644
--- a/llvm/test/CodeGen/RISCV/double-stack-spill-restore.ll
+++ b/llvm/test/CodeGen/RISCV/double-stack-spill-restore.ll
@@ -39,9 +39,9 @@ define double @func(double %d, i32 %n) nounwind {
 ;
 ; RV64IFD-LABEL: func:
 ; RV64IFD:       # %bb.0: # %entry
-; RV64IFD-NEXT:    sext.w a2, a1
 ; RV64IFD-NEXT:    fmv.d.x fa5, a0
-; RV64IFD-NEXT:    beqz a2, .LBB0_2
+; RV64IFD-NEXT:    sext.w a0, a1
+; RV64IFD-NEXT:    beqz a0, .LBB0_2
 ; RV64IFD-NEXT:  # %bb.1: # %if.else
 ; RV64IFD-NEXT:    addi sp, sp, -16
 ; RV64IFD-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
diff --git a/llvm/test/CodeGen/RISCV/fastcc-bf16.ll b/llvm/test/CodeGen/RISCV/fastcc-bf16.ll
index 17356116081ff..91577b96de6ba 100644
--- a/llvm/test/CodeGen/RISCV/fastcc-bf16.ll
+++ b/llvm/test/CodeGen/RISCV/fastcc-bf16.ll
@@ -23,6 +23,9 @@ define bfloat @caller(<32 x bfloat> %A) nounwind {
 ; CHECK-NEXT:    fmv.h.x fa2, a2
 ; CHECK-NEXT:    fmv.h.x fa3, a3
 ; CHECK-NEXT:    fmv.h.x fa4, a4
+; CHECK-NEXT:    fmv.h.x fa5, a5
+; CHECK-NEXT:    fmv.h.x fa6, a6
+; CHECK-NEXT:    fmv.h.x fa7, a7
 ; CHECK-NEXT:    flh ft0, 32(sp)
 ; CHECK-NEXT:    flh ft1, 36(sp)
 ; CHECK-NEXT:    flh ft2, 40(sp)
@@ -47,9 +50,6 @@ define bfloat @caller(<32 x bfloat> %A) nounwind {
 ; CHECK-NEXT:    flh fs9, 116(sp)
 ; CHECK-NEXT:    flh fs10, 120(sp)
 ; CHECK-NEXT:    flh fs11, 124(sp)
-; CHECK-NEXT:    fmv.h.x fa5, a5
-; CHECK-NEXT:    fmv.h.x fa6, a6
-; CHECK-NEXT:    fmv.h.x fa7, a7
 ; CHECK-NEXT:    fsh fs8, 16(sp)
 ; CHECK-NEXT:    fsh fs9, 18(sp)
 ; CHECK-NEXT:    fsh fs10, 20(sp)
diff --git a/llvm/test/CodeGen/RISCV/fastcc-float.ll b/llvm/test/CodeGen/RISCV/fastcc-float.ll
index 237a72d983de4..c1c5fc440d403 100644
--- a/llvm/test/CodeGen/RISCV/fastcc-float.ll
+++ b/llvm/test/CodeGen/RISCV/fastcc-float.ll
@@ -23,6 +23,9 @@ define float @caller(<32 x float> %A) nounwind {
 ; CHECK-NEXT:    fmv.w.x fa2, a2
 ; CHECK-NEXT:    fmv.w.x fa3, a3
 ; CHECK-NEXT:    fmv.w.x fa4, a4
+; CHECK-NEXT:    fmv.w.x fa5, a5
+; CHECK-NEXT:    fmv.w.x fa6, a6
+; CHECK-NEXT:    fmv.w.x fa7, a7
 ; CHECK-NEXT:    flw ft0, 64(sp)
 ; CHECK-NEXT:    flw ft1, 68(sp)
 ; CHECK-NEXT:    flw ft2, 72(sp)
@@ -47,9 +50,6 @@ define float @caller(<32 x float> %A) nounwind {
 ; CHECK-NEXT:    flw fs9, 148(sp)
 ; CHECK-NEXT:    flw fs10, 152(sp)
 ; CHECK-NEXT:    flw fs11, 156(sp)
-; CHECK-NEXT:    fmv.w.x fa5, a5
-; CHECK-NEXT:    fmv.w.x fa6, a6
-; CHECK-NEXT:    fmv.w.x fa7, a7
 ; CHECK-NEXT:    fsw fs8, 32(sp)
 ; CHECK-NEXT:    fsw fs9, 36(sp)
 ; CHECK-NEXT:    fsw fs10, 40(sp)
diff --git a/llvm/test/CodeGen/RISCV/fastcc-half.ll b/llvm/test/CodeGen/RISCV/fastcc-half.ll
index bf8d4e8dcb98c..b5c3f7ef8d523 100644
--- a/llvm/test/CodeGen/RISCV/fastcc-half.ll
+++ b/llvm/test/CodeGen/RISCV/fastcc-half.ll
@@ -23,6 +23,9 @@ define half @caller(<32 x half> %A) nounwind {
 ; CHECK-NEXT:    fmv.h.x fa2, a2
 ; CHECK-NEXT:    fmv.h.x fa3, a3
 ; CHECK-NEXT:    fmv.h.x fa4, a4
+; CHECK-NEXT:    fmv.h.x fa5, a5
+; CHECK-NEXT:    fmv.h.x fa6, a6
+; CHECK-NEXT:    fmv.h.x fa7, a7
 ; CHECK-NEXT:    flh ft0, 32(sp)
 ; CHECK-NEXT:    flh ft1, 36(sp)
 ; CHECK-NEXT:    flh ft2, 40(sp)
@@ -47,9 +50,6 @@ define half @caller(<32 x half> %A) nounwind {
 ; CHECK-NEXT:    flh fs9, 116(sp)
 ; CHECK-NEXT:    flh fs10, 120(sp)
 ; CHECK-NEXT:    flh fs11, 124(sp)
-; CHECK-NEXT:    fmv.h.x fa5, a5
-; CHECK-NEXT:    fmv.h.x fa6, a6
-; CHECK-NEXT:    fmv.h.x fa7, a7
 ; CHECK-NEXT:    fsh fs8, 16(sp)
 ; CHECK-NEXT:    fsh fs9, 18(sp)
 ; CHECK-NEXT:    fsh fs10, 20(sp)
diff --git a/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll b/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll
index 8a91c46bcdaff..beb0df5f292be 100644
--- a/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll
+++ b/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll
@@ -287,6 +287,7 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZHINX32-NEXT:    mv a7, a3
 ; ZHINX32-NEXT:    mv a6, a2
 ; ZHINX32-NEXT:    mv a5, a1
+; ZHINX32-NEXT:    mv a4, a0
 ; ZHINX32-NEXT:    lh t3, 112(sp)
 ; ZHINX32-NEXT:    lh t4, 116(sp)
 ; ZHINX32-NEXT:    lh t5, 120(sp)
@@ -307,14 +308,14 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZHINX32-NEXT:    lh s10, 180(sp)
 ; ZHINX32-NEXT:    lh s11, 184(sp)
 ; ZHINX32-NEXT:    lh ra, 188(sp)
-; ZHINX32-NEXT:    lh a1, 192(sp)
-; ZHINX32-NEXT:    lh a2, 196(sp)
-; ZHINX32-NEXT:    lh a3, 200(sp)
-; ZHINX32-NEXT:    lh a4, 204(sp)
-; ZHINX32-NEXT:    sh a1, 32(sp)
-; ZHINX32-NEXT:    sh a2, 34(sp)
-; ZHINX32-NEXT:    sh a3, 36(sp)
-; ZHINX32-NEXT:    sh a4, 38(sp)
+; ZHINX32-NEXT:    lh a0, 192(sp)
+; ZHINX32-NEXT:    lh a1, 196(sp)
+; ZHINX32-NEXT:    lh a2, 200(sp)
+; ZHINX32-NEXT:    lh a3, 204(sp)
+; ZHINX32-NEXT:    sh a0, 32(sp)
+; ZHINX32-NEXT:    sh a1, 34(sp)
+; ZHINX32-NEXT:    sh a2, 36(sp)
+; ZHINX32-NEXT:    sh a3, 38(sp)
 ; ZHINX32-NEXT:    sh s9, 24(sp)
 ; ZHINX32-NEXT:    sh s10, 26(sp)
 ; ZHINX32-NEXT:    sh s11, 28(sp)
@@ -331,6 +332,7 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZHINX32-NEXT:    sh t1, 2(sp)
 ; ZHINX32-NEXT:    sh t2, 4(sp)
 ; ZHINX32-NEXT:    sh s0, 6(sp)
+; ZHINX32-NEXT:    mv a0, a4
 ; ZHINX32-NEXT:    mv a1, a5
 ; ZHINX32-NEXT:    mv a2, a6
 ; ZHINX32-NEXT:    mv a3, a7
@@ -378,6 +380,7 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZHINX64-NEXT:    mv a7, a3
 ; ZHINX64-NEXT:    mv a6, a2
 ; ZHINX64-NEXT:    mv a5, a1
+; ZHINX64-NEXT:    mv a4, a0
 ; ZHINX64-NEXT:    lh t3, 160(sp)
 ; ZHINX64-NEXT:    lh t4, 168(sp)
 ; ZHINX64-NEXT:    lh t5, 176(sp)
@@ -398,14 +401,14 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZHINX64-NEXT:    lh s10, 296(sp)
 ; ZHINX64-NEXT:    lh s11, 304(sp)
 ; ZHINX64-NEXT:    lh ra, 312(sp)
-; ZHINX64-NEXT:    lh a1, 320(sp)
-; ZHINX64-NEXT:    lh a2, 328(sp)
-; ZHINX64-NEXT:    lh a3, 336(sp)
-; ZHINX64-NEXT:    lh a4, 344(sp)
-; ZHINX64-NEXT:    sh a1, 32(sp)
-; ZHINX64-NEXT:    sh a2, 34(sp)
-; ZHINX64-NEXT:    sh a3, 36(sp)
-; ZHINX64-NEXT:    sh a4, 38(sp)
+; ZHINX64-NEXT:    lh a0, 320(sp)
+; ZHINX64-NEXT:    lh a1, 328(sp)
+; ZHINX64-NEXT:    lh a2, 336(sp)
+; ZHINX64-NEXT:    lh a3, 344(sp)
+; ZHINX64-NEXT:    sh a0, 32(sp)
+; ZHINX64-NEXT:    sh a1, 34(sp)
+; ZHINX64-NEXT:    sh a2, 36(sp)
+; ZHINX64-NEXT:    sh a3, 38(sp)
 ; ZHINX64-NEXT:    sh s9, 24(sp)
 ; ZHINX64-NEXT:    sh s10, 26(sp)
 ; ZHINX64-NEXT:    sh s11, 28(sp)
@@ -422,6 +425,7 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZHINX64-NEXT:    sh t1, 2(sp)
 ; ZHINX64-NEXT:    sh t2, 4(sp)
 ; ZHINX64-NEXT:    sh s0, 6(sp)
+; ZHINX64-NEXT:    mv a0, a4
 ; ZHINX64-NEXT:    mv a1, a5
 ; ZHINX64-NEXT:    mv a2, a6
 ; ZHINX64-NEXT:    mv a3, a7
@@ -893,6 +897,7 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZHINX32-NEXT:    mv a7, a3
 ; ZHINX32-NEXT:    mv a6, a2
 ; ZHINX32-NEXT:    mv a5, a1
+; ZHINX32-NEXT:    mv a4, a0
 ; ZHINX32-NEXT:    lw t3, 160(sp)
 ; ZHINX32-NEXT:    lw t4, 164(sp)
 ; ZHINX32-NEXT:    lw t5, 168(sp)
@@ -913,14 +918,14 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZHINX32-NEXT:    lw s10, 228(sp)
 ; ZHINX32-NEXT:    lw s11, 232(sp)
 ; ZHINX32-NEXT:    lw ra, 236(sp)
-; ZHINX32-NEXT:    lw a1, 240(sp)
-; ZHINX32-NEXT:    lw a2, 244(sp)
-; ZHINX32-NEXT:    lw a3, 248(sp)
-; ZHINX32-NEXT:    lw a4, 252(sp)
-; ZHINX32-NEXT:    sw a1, 64(sp)
-; ZHINX32-NEXT:    sw a2, 68(sp)
-; ZHINX32-NEXT:    sw a3, 72(sp)
-; ZHINX32-NEXT:    sw a4, 76(sp)
+; ZHINX32-NEXT:    lw a0, 240(sp)
+; ZHINX32-NEXT:    lw a1, 244(sp)
+; ZHINX32-NEXT:    lw a2, 248(sp)
+; ZHINX32-NEXT:    lw a3, 252(sp)
+; ZHINX32-NEXT:    sw a0, 64(sp)
+; ZHINX32-NEXT:    sw a1, 68(sp)
+; ZHINX32-NEXT:    sw a2, 72(sp)
+; ZHINX32-NEXT:    sw a3, 76(sp)
 ; ZHINX32-NEXT:    sw s9, 48(sp)
 ; ZHINX32-NEXT:    sw s10, 52(sp)
 ; ZHINX32-NEXT:    sw s11, 56(sp)
@@ -937,6 +942,7 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZHINX32-NEXT:    sw t1, 4(sp)
 ; ZHINX32-NEXT:    sw t2, 8(sp)
 ; ZHINX32-NEXT:    sw s0, 12(sp)
+; ZHINX32-NEXT:    mv a0, a4
 ; ZHINX32-NEXT:    mv a1, a5
 ; ZHINX32-NEXT:    mv a2, a6
 ; ZHINX32-NEXT:    mv a3, a7
@@ -984,6 +990,7 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZHINX64-NEXT:    mv a7, a3
 ; ZHINX64-NEXT:    mv a6, a2
 ; ZHINX64-NEXT:    mv a5, a1
+; ZHINX64-NEXT:    mv a4, a0
 ; ZHINX64-NEXT:    lw t3, 208(sp)
 ; ZHINX64-NEXT:    lw t4, 216(sp)
 ; ZHINX64-NEXT:    lw t5, 224(sp)
@@ -1004,14 +1011,14 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZHINX64-NEXT:    lw s10, 344(sp)
 ; ZHINX64-NEXT:    lw s11, 352(sp)
 ; ZHINX64-NEXT:    lw ra, 360(sp)
-; ZHINX64-NEXT:    lw a1, 368(sp)
-; ZHINX64-NEXT:    lw a2, 376(sp)
-; ZHINX64-NEXT:    lw a3, 384(sp)
-; ZHINX64-NEXT:    lw a4, 392(sp)
-; ZHINX64-NEXT:    sw a1, 64(sp)
-; ZHINX64-NEXT:    sw a2, 68(sp)
-; ZHINX64-NEXT:    sw a3, 72(sp)
-; ZHINX64-NEXT:    sw a4, 76(sp)
+; ZHINX64-NEXT:    lw a0, 368(sp)
+; ZHINX64-NEXT:    lw a1, 376(sp)
+; ZHINX64-NEXT:    lw a2, 384(sp)
+; ZHINX64-NEXT:    lw a3, 392(sp)
+; ZHINX64-NEXT:    sw a0, 64(sp)
+; ZHINX64-NEXT:    sw a1, 68(sp)
+; ZHINX64-NEXT:    sw a2, 72(sp)
+; ZHINX64-NEXT:    sw a3, 76(sp)
 ; ZHINX64-NEXT:    sw s9, 48(sp)
 ; ZHINX64-NEXT:    sw s10, 52(sp)
 ; ZHINX64-NEXT:    sw s11, 56(sp)
@@ -1028,6 +1035,7 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZHINX64-NEXT:    sw t1, 4(sp)
 ; ZHINX64-NEXT:    sw t2, 8(sp)
 ; ZHINX64-NEXT:    sw s0, 12(sp)
+; ZHINX64-NEXT:    mv a0, a4
 ; ZHINX64-NEXT:    mv a1, a5
 ; ZHINX64-NEXT:    mv a2, a6
 ; ZHINX64-NEXT:    mv a3, a7
@@ -1075,6 +1083,7 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZFINX32-NEXT:    mv a7, a3
 ; ZFINX32-NEXT:    mv a6, a2
 ; ZFINX32-NEXT:    mv a5, a1
+; ZFINX32-NEXT:    mv a4, a0
 ; ZFINX32-NEXT:    lw t3, 160(sp)
 ; ZFINX32-NEXT:    lw t4, 164(sp)
 ; ZFINX32-NEXT:    lw t5, 168(sp)
@@ -1095,14 +1104,14 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZFINX32-NEXT:    lw s10, 228(sp)
 ; ZFINX32-NEXT:    lw s11, 232(sp)
 ; ZFINX32-NEXT:    lw ra, 236(sp)
-; ZFINX32-NEXT:    lw a1, 240(sp)
-; ZFINX32-NEXT:    lw a2, 244(sp)
-; ZFINX32-NEXT:    lw a3, 248(sp)
-; ZFINX32-NEXT:    lw a4, 252(sp)
-; ZFINX32-NEXT:    sw a1, 64(sp)
-; ZFINX32-NEXT:    sw a2, 68(sp)
-; ZFINX32-NEXT:    sw a3, 72(sp)
-; ZFINX32-NEXT:    sw a4, 76(sp)
+; ZFINX32-NEXT:    lw a0, 240(sp)
+; ZFINX32-NEXT:    lw a1, 244(sp)
+; ZFINX32-NEXT:    lw a2, 248(sp)
+; ZFINX32-NEXT:    lw a3, 252(sp)
+; ZFINX32-NEXT:    sw a0, 64(sp)
+; ZFINX32-NEXT:    sw a1, 68(sp)
+; ZFINX32-NEXT:    sw a2, 72(sp)
+; ZFINX32-NEXT:    sw a3, 76(sp)
 ; ZFINX32-NEXT:    sw s9, 48(sp)
 ; ZFINX32-NEXT:    sw s10, 52(sp)
 ; ZFINX32-NEXT:    sw s11, 56(sp)
@@ -1119,6 +1128,7 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZFINX32-NEXT:    sw t1, 4(sp)
 ; ZFINX32-NEXT:    sw t2, 8(sp)
 ; ZFINX32-NEXT:    sw s0, 12(sp)
+; ZFINX32-NEXT:    mv a0, a4
 ; ZFINX32-NEXT:    mv a1, a5
 ; ZFINX32-NEXT:    mv a2, a6
 ; ZFINX32-NEXT:    mv a3, a7
@@ -1166,6 +1176,7 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZFINX64-NEXT:    mv a7, a3
 ; ZFINX64-NEXT:    mv a6, a2
 ; ZFINX64-NEXT:    mv a5, a1
+; ZFINX64-NEXT:    mv a4, a0
 ; ZFINX64-NEXT:    lw t3, 208(sp)
 ; ZFINX64-NEXT:    lw t4, 216(sp)
 ; ZFINX64-NEXT:    lw t5, 224(sp)
@@ -1186,14 +1197,14 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZFINX64-NEXT:    lw s10, 344(sp)
 ; ZFINX64-NEXT:    lw s11, 352(sp)
 ; ZFINX64-NEXT:    lw ra, 360(sp)
-; ZFINX64-NEXT:    lw a1, 368(sp)
-; ZFINX64-NEXT:    lw a2, 376(sp)
-; ZFINX64-NEXT:    lw a3, 384(sp)
-; ZFINX64-NEXT:    lw a4, 392(sp)
-; ZFINX64-NEXT:    sw a1, 64(sp)
-; ZFINX64-NEXT:    sw a2, 68(sp)
-; ZFINX64-NEXT:    sw a3, 72(sp)
-; ZFINX64-NEXT:    sw a4, 76(sp)
+; ZFINX64-NEXT:    lw a0, 368(sp)
+; ZFINX64-NEXT:    lw a1, 376(sp)
+; ZFINX64-NEXT:    lw a2, 384(sp)
+; ZFINX64-NEXT:    lw a3, 392(sp)
+; ZFINX64-NEXT:    sw a0, 64(sp)
+; ZFINX64-NEXT:    sw a1, 68(sp)
+; ZFINX64-NEXT:    sw a2, 72(sp)
+; ZFINX64-NEXT:    sw a3, 76(sp)
 ; ZFINX64-NEXT:    sw s9, 48(sp)
 ; ZFINX64-NEXT:    sw s10, 52(sp)
 ; ZFINX64-NEXT:    sw s11, 56(sp)
@@ -1210,6 +1221,7 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZFINX64-NEXT:    sw t1, 4(sp)
 ; ZFINX64-NEXT:    sw t2, 8(sp)
 ; ZFINX64-NEXT:    sw s0, 12(sp)
+; ZFINX64-NEXT:    mv a0, a4
 ; ZFINX64-NEXT:    mv a1, a5
 ; ZFINX64-NEXT:    mv a2, a6
 ; ZFINX64-NEXT:    mv a3, a7
@@ -1257,6 +1269,7 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZDINX32-NEXT:    mv a7, a3
 ; ZDINX32-NEXT:    mv a6, a2
 ; ZDINX32-NEXT:    mv a5, a1
+; ZDINX32-NEXT:    mv a4, a0
 ; ZDINX32-NEXT:    lw t3, 160(sp)
 ; ZDINX32-NEXT:    lw t4, 164(sp)
 ; ZDINX32-NEXT:    lw t5, 168(sp)
@@ -1277,14 +1290,14 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZDINX32-NEXT:    lw s10, 228(sp)
 ; ZDINX32-NEXT:    lw s11, 232(sp)
 ; ZDINX32-NEXT:    lw ra, 236(sp)
-; ZDINX32-NEXT:    lw a1, 240(sp)
-; ZDINX32-NEXT:    lw a2, 244(sp)
-; ZDINX32-NEXT:    lw a3, 248(sp)
-; ZDINX32-NEXT:    lw a4, 252(sp)
-; ZDINX32-NEXT:    sw a1, 64(sp)
-; ZDINX32-NEXT:    sw a2, 68(sp)
-; ZDINX32-NEXT:    sw a3, 72(sp)
-; ZDINX32-NEXT:    sw a4, 76(sp)
+; ZDINX32-NEXT:    lw a0, 240(sp)
+; ZDINX32-NEXT:    lw a1, 244(sp)
+; ZDINX32-NEXT:    lw a2, 248(sp)
+; ZDINX32-NEXT:    lw a3, 252(sp)
+; ZDINX32-NEXT:    sw a0, 64(sp)
+; ZDINX32-NEXT:    sw a1, 68(sp)
+; ZDINX32-NEXT:    sw a2, 72(sp)
+; ZDINX32-NEXT:    sw a3, 76(sp)
 ; ZDINX32-NEXT:    sw s9, 48(sp)
 ; ZDINX32-NEXT:    sw s10, 52(sp)
 ; ZDINX32-NEXT:    sw s11, 56(sp)
@@ -1301,6 +1314,7 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZDINX32-NEXT:    sw t1, 4(sp)
 ; ZDINX32-NEXT:    sw t2, 8(sp)
 ; ZDINX32-NEXT:    sw s0, 12(sp)
+; ZDINX32-NEXT:    mv a0, a4
 ; ZDINX32-NEXT:    mv a1, a5
 ; ZDINX32-NEXT:    mv a2, a6
 ; ZDINX32-NEXT:    mv a3, a7
@@ -1348,6 +1362,7 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZDINX64-NEXT:    mv a7, a3
 ; ZDINX64-NEXT:    mv a6, a2
 ; ZDINX64-NEXT:    mv a5, a1
+; ZDINX64-NEXT:    mv a4, a0
 ; ZDINX64-NEXT:    lw t3, 208(sp)
 ; ZDINX64-NEXT:    lw t4, 216(sp)
 ; ZDINX64-NEXT:    lw t5, 224(sp)
@@ -1368,14 +1383,14 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZDINX64-NEXT:    lw s10, 344(sp)
 ; ZDINX64-NEXT:    lw s11, 352(sp)
 ; ZDINX64-NEXT:    lw ra, 360(sp)
-; ZDINX64-NEXT:    lw a1, 368(sp)
-; ZDINX64-NEXT:    lw a2, 376(sp)
-; ZDINX64-NEXT:    lw a3, 384(sp)
-; ZDINX64-NEXT:    lw a4, 392(sp)
-; ZDINX64-NEXT:    sw a1, 64(sp)
-; ZDINX64-NEXT:    sw a2, 68(sp)
-; ZDINX64-NEXT:    sw a3, 72(sp)
-; ZDINX64-NEXT:    sw a4, 76(sp)
+; ZDINX64-NEXT:    lw a0, 368(sp)
+; ZDINX64-NEXT:    lw a1, 376(sp)
+; ZDINX64-NEXT:    lw a2, 384(sp)
+; ZDINX64-NEXT:    lw a3, 392(sp)
+; ZDINX64-NEXT:    sw a0, 64(sp)
+; ZDINX64-NEXT:    sw a1, 68(sp)
+; ZDINX64-NEXT:    sw a2, 72(sp)
+; ZDINX64-NEXT:    sw a3, 76(sp)
 ; ZDINX64-NEXT:    sw s9, 48(sp)
 ; ZDINX64-NEXT:    sw s10, 52(sp)
 ; ZDINX64-NEXT:    sw s11, 56(sp)
@@ -1392,6 +1407,7 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZDINX64-NEXT:    sw t1, 4(sp)
 ; ZDINX64-NEXT:    sw t2, 8(sp)
 ; ZDINX64-NEXT:    sw s0, 12(sp)
+; ZDINX64-NEXT:    mv a0, a4
 ; ZDINX64-NEXT:    mv a1, a5
 ; ZDINX64-NEXT:    mv a2, a6
 ; ZDINX64-NEXT:    mv a3, a7
diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll
index fc866d71a3a70..89858af3282d6 100644
--- a/llvm/test/CodeGen/RISCV/float-convert.ll
+++ b/llvm/test/CodeGen/RISCV/float-convert.ll
@@ -1417,13 +1417,13 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind {
 ; RV32IF-LABEL: fcvt_w_s_sat_i16:
 ; RV32IF:       # %bb.0: # %start
 ; RV32IF-NEXT:    feq.s a0, fa0, fa0
-; RV32IF-NEXT:    lui a1, %hi(.LCPI24_0)
-; RV32IF-NEXT:    flw fa5, %lo(.LCPI24_0)(a1)
 ; RV32IF-NEXT:    lui a1, 815104
-; RV32IF-NEXT:    fmv.w.x fa4, a1
-; RV32IF-NEXT:    fmax.s fa4, fa0, fa4
+; RV32IF-NEXT:    fmv.w.x fa5, a1
+; RV32IF-NEXT:    lui a1, %hi(.LCPI24_0)
 ; RV32IF-NEXT:    neg a0, a0
-; RV32IF-NEXT:    fmin.s fa5, fa4, fa5
+; RV32IF-NEXT:    fmax.s fa5, fa0, fa5
+; RV32IF-NEXT:    flw fa4, %lo(.LCPI24_0)(a1)
+; RV32IF-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IF-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32IF-NEXT:    and a0, a0, a1
 ; RV32IF-NEXT:    ret
@@ -1431,13 +1431,13 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind {
 ; RV64IF-LABEL: fcvt_w_s_sat_i16:
 ; RV64IF:       # %bb.0: # %start
 ; RV64IF-NEXT:    feq.s a0, fa0, fa0
-; RV64IF-NEXT:    lui a1, %hi(.LCPI24_0)
-; RV64IF-NEXT:    flw fa5, %lo(.LCPI24_0)(a1)
 ; RV64IF-NEXT:    lui a1, 815104
-; RV64IF-NEXT:    fmv.w.x fa4, a1
-; RV64IF-NEXT:    fmax.s fa4, fa0, fa4
+; RV64IF-NEXT:    fmv.w.x fa5, a1
+; RV64IF-NEXT:    lui a1, %hi(.LCPI24_0)
 ; RV64IF-NEXT:    neg a0, a0
-; RV64IF-NEXT:    fmin.s fa5, fa4, fa5
+; RV64IF-NEXT:    fmax.s fa5, fa0, fa5
+; RV64IF-NEXT:    flw fa4, %lo(.LCPI24_0)(a1)
+; RV64IF-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IF-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64IF-NEXT:    and a0, a0, a1
 ; RV64IF-NEXT:    ret
@@ -1602,21 +1602,21 @@ define zeroext i16 @fcvt_wu_s_i16(float %a) nounwind {
 define zeroext i16 @fcvt_wu_s_sat_i16(float %a) nounwind {
 ; RV32IF-LABEL: fcvt_wu_s_sat_i16:
 ; RV32IF:       # %bb.0: # %start
+; RV32IF-NEXT:    fmv.w.x fa5, zero
 ; RV32IF-NEXT:    lui a0, %hi(.LCPI26_0)
-; RV32IF-NEXT:    flw fa5, %lo(.LCPI26_0)(a0)
-; RV32IF-NEXT:    fmv.w.x fa4, zero
-; RV32IF-NEXT:    fmax.s fa4, fa0, fa4
-; RV32IF-NEXT:    fmin.s fa5, fa4, fa5
+; RV32IF-NEXT:    fmax.s fa5, fa0, fa5
+; RV32IF-NEXT:    flw fa4, %lo(.LCPI26_0)(a0)
+; RV32IF-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IF-NEXT:    fcvt.wu.s a0, fa5, rtz
 ; RV32IF-NEXT:    ret
 ;
 ; RV64IF-LABEL: fcvt_wu_s_sat_i16:
 ; RV64IF:       # %bb.0: # %start
+; RV64IF-NEXT:    fmv.w.x fa5, zero
 ; RV64IF-NEXT:    lui a0, %hi(.LCPI26_0)
-; RV64IF-NEXT:    flw fa5, %lo(.LCPI26_0)(a0)
-; RV64IF-NEXT:    fmv.w.x fa4, zero
-; RV64IF-NEXT:    fmax.s fa4, fa0, fa4
-; RV64IF-NEXT:    fmin.s fa5, fa4, fa5
+; RV64IF-NEXT:    fmax.s fa5, fa0, fa5
+; RV64IF-NEXT:    flw fa4, %lo(.LCPI26_0)(a0)
+; RV64IF-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IF-NEXT:    fcvt.lu.s a0, fa5, rtz
 ; RV64IF-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll
index 0cbfc96bf485e..9b3a643e59e68 100644
--- a/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll
+++ b/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll
@@ -234,8 +234,8 @@ define i32 @fcmp_one(float %a, float %b) nounwind strictfp {
 ; CHECKIF-NEXT:    frflags a0
 ; CHECKIF-NEXT:    flt.s a2, fa1, fa0
 ; CHECKIF-NEXT:    fsflags a0
-; CHECKIF-NEXT:    or a0, a2, a1
 ; CHECKIF-NEXT:    feq.s zero, fa1, fa0
+; CHECKIF-NEXT:    or a0, a2, a1
 ; CHECKIF-NEXT:    ret
 ;
 ; CHECKIZFINX-LABEL: fcmp_one:
@@ -247,9 +247,8 @@ define i32 @fcmp_one(float %a, float %b) nounwind strictfp {
 ; CHECKIZFINX-NEXT:    frflags a2
 ; CHECKIZFINX-NEXT:    flt.s a4, a1, a0
 ; CHECKIZFINX-NEXT:    fsflags a2
-; CHECKIZFINX-NEXT:    or a2, a4, a3
 ; CHECKIZFINX-NEXT:    feq.s zero, a1, a0
-; CHECKIZFINX-NEXT:    mv a0, a2
+; CHECKIZFINX-NEXT:    or a0, a4, a3
 ; CHECKIZFINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_one:
@@ -353,9 +352,9 @@ define i32 @fcmp_ueq(float %a, float %b) nounwind strictfp {
 ; CHECKIF-NEXT:    frflags a0
 ; CHECKIF-NEXT:    flt.s a2, fa1, fa0
 ; CHECKIF-NEXT:    fsflags a0
+; CHECKIF-NEXT:    feq.s zero, fa1, fa0
 ; CHECKIF-NEXT:    or a1, a2, a1
 ; CHECKIF-NEXT:    xori a0, a1, 1
-; CHECKIF-NEXT:    feq.s zero, fa1, fa0
 ; CHECKIF-NEXT:    ret
 ;
 ; CHECKIZFINX-LABEL: fcmp_ueq:
@@ -367,10 +366,9 @@ define i32 @fcmp_ueq(float %a, float %b) nounwind strictfp {
 ; CHECKIZFINX-NEXT:    frflags a2
 ; CHECKIZFINX-NEXT:    flt.s a4, a1, a0
 ; CHECKIZFINX-NEXT:    fsflags a2
-; CHECKIZFINX-NEXT:    or a3, a4, a3
-; CHECKIZFINX-NEXT:    xori a2, a3, 1
 ; CHECKIZFINX-NEXT:    feq.s zero, a1, a0
-; CHECKIZFINX-NEXT:    mv a0, a2
+; CHECKIZFINX-NEXT:    or a3, a4, a3
+; CHECKIZFINX-NEXT:    xori a0, a3, 1
 ; CHECKIZFINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_ueq:
@@ -429,8 +427,8 @@ define i32 @fcmp_ugt(float %a, float %b) nounwind strictfp {
 ; CHECKIF-NEXT:    frflags a0
 ; CHECKIF-NEXT:    fle.s a1, fa0, fa1
 ; CHECKIF-NEXT:    fsflags a0
-; CHECKIF-NEXT:    xori a0, a1, 1
 ; CHECKIF-NEXT:    feq.s zero, fa0, fa1
+; CHECKIF-NEXT:    xori a0, a1, 1
 ; CHECKIF-NEXT:    ret
 ;
 ; CHECKIZFINX-LABEL: fcmp_ugt:
@@ -438,9 +436,8 @@ define i32 @fcmp_ugt(float %a, float %b) nounwind strictfp {
 ; CHECKIZFINX-NEXT:    frflags a2
 ; CHECKIZFINX-NEXT:    fle.s a3, a0, a1
 ; CHECKIZFINX-NEXT:    fsflags a2
-; CHECKIZFINX-NEXT:    xori a2, a3, 1
 ; CHECKIZFINX-NEXT:    feq.s zero, a0, a1
-; CHECKIZFINX-NEXT:    mv a0, a2
+; CHECKIZFINX-NEXT:    xori a0, a3, 1
 ; CHECKIZFINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_ugt:
@@ -473,8 +470,8 @@ define i32 @fcmp_uge(float %a, float %b) nounwind strictfp {
 ; CHECKIF-NEXT:    frflags a0
 ; CHECKIF-NEXT:    flt.s a1, fa0, fa1
 ; CHECKIF-NEXT:    fsflags a0
-; CHECKIF-NEXT:    xori a0, a1, 1
 ; CHECKIF-NEXT:    feq.s zero, fa0, fa1
+; CHECKIF-NEXT:    xori a0, a1, 1
 ; CHECKIF-NEXT:    ret
 ;
 ; CHECKIZFINX-LABEL: fcmp_uge:
@@ -482,9 +479,8 @@ define i32 @fcmp_uge(float %a, float %b) nounwind strictfp {
 ; CHECKIZFINX-NEXT:    frflags a2
 ; CHECKIZFINX-NEXT:    flt.s a3, a0, a1
 ; CHECKIZFINX-NEXT:    fsflags a2
-; CHECKIZFINX-NEXT:    xori a2, a3, 1
 ; CHECKIZFINX-NEXT:    feq.s zero, a0, a1
-; CHECKIZFINX-NEXT:    mv a0, a2
+; CHECKIZFINX-NEXT:    xori a0, a3, 1
 ; CHECKIZFINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_uge:
@@ -519,8 +515,8 @@ define i32 @fcmp_ult(float %a, float %b) nounwind strictfp {
 ; CHECKIF-NEXT:    frflags a0
 ; CHECKIF-NEXT:    fle.s a1, fa1, fa0
 ; CHECKIF-NEXT:    fsflags a0
-; CHECKIF-NEXT:    xori a0, a1, 1
 ; CHECKIF-NEXT:    feq.s zero, fa1, fa0
+; CHECKIF-NEXT:    xori a0, a1, 1
 ; CHECKIF-NEXT:    ret
 ;
 ; CHECKIZFINX-LABEL: fcmp_ult:
@@ -528,9 +524,8 @@ define i32 @fcmp_ult(float %a, float %b) nounwind strictfp {
 ; CHECKIZFINX-NEXT:    frflags a2
 ; CHECKIZFINX-NEXT:    fle.s a3, a1, a0
 ; CHECKIZFINX-NEXT:    fsflags a2
-; CHECKIZFINX-NEXT:    xori a2, a3, 1
 ; CHECKIZFINX-NEXT:    feq.s zero, a1, a0
-; CHECKIZFINX-NEXT:    mv a0, a2
+; CHECKIZFINX-NEXT:    xori a0, a3, 1
 ; CHECKIZFINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_ult:
@@ -563,8 +558,8 @@ define i32 @fcmp_ule(float %a, float %b) nounwind strictfp {
 ; CHECKIF-NEXT:    frflags a0
 ; CHECKIF-NEXT:    flt.s a1, fa1, fa0
 ; CHECKIF-NEXT:    fsflags a0
-; CHECKIF-NEXT:    xori a0, a1, 1
 ; CHECKIF-NEXT:    feq.s zero, fa1, fa0
+; CHECKIF-NEXT:    xori a0, a1, 1
 ; CHECKIF-NEXT:    ret
 ;
 ; CHECKIZFINX-LABEL: fcmp_ule:
@@ -572,9 +567,8 @@ define i32 @fcmp_ule(float %a, float %b) nounwind strictfp {
 ; CHECKIZFINX-NEXT:    frflags a2
 ; CHECKIZFINX-NEXT:    flt.s a3, a1, a0
 ; CHECKIZFINX-NEXT:    fsflags a2
-; CHECKIZFINX-NEXT:    xori a2, a3, 1
 ; CHECKIZFINX-NEXT:    feq.s zero, a1, a0
-; CHECKIZFINX-NEXT:    mv a0, a2
+; CHECKIZFINX-NEXT:    xori a0, a3, 1
 ; CHECKIZFINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcmp_ule:
diff --git a/llvm/test/CodeGen/RISCV/float-select-fcmp.ll b/llvm/test/CodeGen/RISCV/float-select-fcmp.ll
index a2ff0d33e2d31..5ec0335972394 100644
--- a/llvm/test/CodeGen/RISCV/float-select-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/float-select-fcmp.ll
@@ -387,12 +387,12 @@ define i32 @i32_select_fcmp_oeq(float %a, float %b, i32 %c, i32 %d) nounwind {
 ;
 ; CHECKZFINX-LABEL: i32_select_fcmp_oeq:
 ; CHECKZFINX:       # %bb.0:
-; CHECKZFINX-NEXT:    feq.s a1, a0, a1
-; CHECKZFINX-NEXT:    mv a0, a2
-; CHECKZFINX-NEXT:    bnez a1, .LBB16_2
+; CHECKZFINX-NEXT:    feq.s a0, a0, a1
+; CHECKZFINX-NEXT:    bnez a0, .LBB16_2
 ; CHECKZFINX-NEXT:  # %bb.1:
-; CHECKZFINX-NEXT:    mv a0, a3
+; CHECKZFINX-NEXT:    mv a2, a3
 ; CHECKZFINX-NEXT:  .LBB16_2:
+; CHECKZFINX-NEXT:    mv a0, a2
 ; CHECKZFINX-NEXT:    ret
   %1 = fcmp oeq float %a, %b
   %2 = select i1 %1, i32 %c, i32 %d
diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
index b8dc7804c4908..59ba3652c89e9 100644
--- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
+++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
@@ -929,19 +929,19 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind {
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call f
+; RV32I-NEXT:    addi s5, s5, 1
+; RV32I-NEXT:    seqz a0, s5
+; RV32I-NEXT:    add s6, s6, a0
 ; RV32I-NEXT:    lw a0, 8(s7)
 ; RV32I-NEXT:    lw a1, 12(s7)
-; RV32I-NEXT:    addi s5, s5, 1
-; RV32I-NEXT:    seqz a2, s5
-; RV32I-NEXT:    add s6, s6, a2
-; RV32I-NEXT:    xor a2, s5, s2
 ; RV32I-NEXT:    add a1, a1, s4
-; RV32I-NEXT:    xor a3, s6, s1
-; RV32I-NEXT:    or a2, a2, a3
+; RV32I-NEXT:    xor a2, s5, s2
 ; RV32I-NEXT:    add s3, a0, s3
 ; RV32I-NEXT:    sltu s4, s3, a0
 ; RV32I-NEXT:    add s4, a1, s4
-; RV32I-NEXT:    bnez a2, .LBB20_5
+; RV32I-NEXT:    xor a0, s6, s1
+; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    bnez a0, .LBB20_5
 ; RV32I-NEXT:  .LBB20_6: # %for.cond.cleanup
 ; RV32I-NEXT:    mv a0, s3
 ; RV32I-NEXT:    mv a1, s4
@@ -994,19 +994,19 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind {
 ; RV32I-MEDIUM-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32I-MEDIUM-NEXT:    mv a0, s0
 ; RV32I-MEDIUM-NEXT:    call f
+; RV32I-MEDIUM-NEXT:    addi s5, s5, 1
+; RV32I-MEDIUM-NEXT:    seqz a0, s5
+; RV32I-MEDIUM-NEXT:    add s6, s6, a0
 ; RV32I-MEDIUM-NEXT:    lw a0, 8(s7)
 ; RV32I-MEDIUM-NEXT:    lw a1, 12(s7)
-; RV32I-MEDIUM-NEXT:    addi s5, s5, 1
-; RV32I-MEDIUM-NEXT:    seqz a2, s5
-; RV32I-MEDIUM-NEXT:    add s6, s6, a2
-; RV32I-MEDIUM-NEXT:    xor a2, s5, s2
 ; RV32I-MEDIUM-NEXT:    add a1, a1, s4
-; RV32I-MEDIUM-NEXT:    xor a3, s6, s1
-; RV32I-MEDIUM-NEXT:    or a2, a2, a3
+; RV32I-MEDIUM-NEXT:    xor a2, s5, s2
 ; RV32I-MEDIUM-NEXT:    add s3, a0, s3
 ; RV32I-MEDIUM-NEXT:    sltu s4, s3, a0
 ; RV32I-MEDIUM-NEXT:    add s4, a1, s4
-; RV32I-MEDIUM-NEXT:    bnez a2, .LBB20_5
+; RV32I-MEDIUM-NEXT:    xor a0, s6, s1
+; RV32I-MEDIUM-NEXT:    or a0, a2, a0
+; RV32I-MEDIUM-NEXT:    bnez a0, .LBB20_5
 ; RV32I-MEDIUM-NEXT:  .LBB20_6: # %for.cond.cleanup
 ; RV32I-MEDIUM-NEXT:    mv a0, s3
 ; RV32I-MEDIUM-NEXT:    mv a1, s4
@@ -1042,8 +1042,8 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind {
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call f
 ; RV64I-NEXT:    ld a0, 8(s3)
-; RV64I-NEXT:    addi s1, s1, -1
 ; RV64I-NEXT:    add s2, a0, s2
+; RV64I-NEXT:    addi s1, s1, -1
 ; RV64I-NEXT:    bnez s1, .LBB20_2
 ; RV64I-NEXT:    j .LBB20_4
 ; RV64I-NEXT:  .LBB20_3:
@@ -1078,8 +1078,8 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind {
 ; RV64I-MEDIUM-NEXT:    mv a0, s0
 ; RV64I-MEDIUM-NEXT:    call f
 ; RV64I-MEDIUM-NEXT:    ld a0, 8(s3)
-; RV64I-MEDIUM-NEXT:    addi s1, s1, -1
 ; RV64I-MEDIUM-NEXT:    add s2, a0, s2
+; RV64I-MEDIUM-NEXT:    addi s1, s1, -1
 ; RV64I-MEDIUM-NEXT:    bnez s1, .LBB20_2
 ; RV64I-MEDIUM-NEXT:    j .LBB20_4
 ; RV64I-MEDIUM-NEXT:  .LBB20_3:
@@ -1108,18 +1108,18 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind {
 ; RV64I-LARGE-NEXT:    mv s0, a2
 ; RV64I-LARGE-NEXT:    mv s1, a1
 ; RV64I-LARGE-NEXT:    li s2, 0
+; RV64I-LARGE-NEXT:    slli a0, a0, 4
 ; RV64I-LARGE-NEXT:  .Lpcrel_hi14:
 ; RV64I-LARGE-NEXT:    auipc a1, %pcrel_hi(.LCPI20_0)
-; RV64I-LARGE-NEXT:    ld s3, %pcrel_lo(.Lpcrel_hi14)(a1)
-; RV64I-LARGE-NEXT:    slli a0, a0, 4
-; RV64I-LARGE-NEXT:    add s4, a2, a0
+; RV64I-LARGE-NEXT:    add s3, a2, a0
+; RV64I-LARGE-NEXT:    ld s4, %pcrel_lo(.Lpcrel_hi14)(a1)
 ; RV64I-LARGE-NEXT:  .LBB20_2: # %for.body
 ; RV64I-LARGE-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64I-LARGE-NEXT:    mv a0, s0
-; RV64I-LARGE-NEXT:    jalr s3
-; RV64I-LARGE-NEXT:    ld a0, 8(s4)
-; RV64I-LARGE-NEXT:    addi s1, s1, -1
+; RV64I-LARGE-NEXT:    jalr s4
+; RV64I-LARGE-NEXT:    ld a0, 8(s3)
 ; RV64I-LARGE-NEXT:    add s2, a0, s2
+; RV64I-LARGE-NEXT:    addi s1, s1, -1
 ; RV64I-LARGE-NEXT:    bnez s1, .LBB20_2
 ; RV64I-LARGE-NEXT:    j .LBB20_4
 ; RV64I-LARGE-NEXT:  .LBB20_3:
diff --git a/llvm/test/CodeGen/RISCV/forced-atomics.ll b/llvm/test/CodeGen/RISCV/forced-atomics.ll
index e7719dc70660b..3ea9c4c6ad754 100644
--- a/llvm/test/CodeGen/RISCV/forced-atomics.ll
+++ b/llvm/test/CodeGen/RISCV/forced-atomics.ll
@@ -1357,28 +1357,28 @@ define i32 @rmw32_max_seq_cst(ptr %p) nounwind {
 ; RV32-NO-ATOMIC-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NO-ATOMIC-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NO-ATOMIC-NEXT:    mv s0, a0
-; RV32-NO-ATOMIC-NEXT:    lw a1, 0(a0)
+; RV32-NO-ATOMIC-NEXT:    lw a0, 0(a0)
 ; RV32-NO-ATOMIC-NEXT:    j .LBB23_2
 ; RV32-NO-ATOMIC-NEXT:  .LBB23_1: # %atomicrmw.start
 ; RV32-NO-ATOMIC-NEXT:    # in Loop: Header=BB23_2 Depth=1
-; RV32-NO-ATOMIC-NEXT:    sw a1, 4(sp)
+; RV32-NO-ATOMIC-NEXT:    sw a0, 4(sp)
 ; RV32-NO-ATOMIC-NEXT:    addi a1, sp, 4
 ; RV32-NO-ATOMIC-NEXT:    li a3, 5
 ; RV32-NO-ATOMIC-NEXT:    li a4, 5
 ; RV32-NO-ATOMIC-NEXT:    mv a0, s0
 ; RV32-NO-ATOMIC-NEXT:    call __atomic_compare_exchange_4
-; RV32-NO-ATOMIC-NEXT:    lw a1, 4(sp)
-; RV32-NO-ATOMIC-NEXT:    bnez a0, .LBB23_4
+; RV32-NO-ATOMIC-NEXT:    mv a1, a0
+; RV32-NO-ATOMIC-NEXT:    lw a0, 4(sp)
+; RV32-NO-ATOMIC-NEXT:    bnez a1, .LBB23_4
 ; RV32-NO-ATOMIC-NEXT:  .LBB23_2: # %atomicrmw.start
 ; RV32-NO-ATOMIC-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NO-ATOMIC-NEXT:    mv a2, a1
-; RV32-NO-ATOMIC-NEXT:    bgtz a1, .LBB23_1
+; RV32-NO-ATOMIC-NEXT:    mv a2, a0
+; RV32-NO-ATOMIC-NEXT:    bgtz a0, .LBB23_1
 ; RV32-NO-ATOMIC-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NO-ATOMIC-NEXT:    # in Loop: Header=BB23_2 Depth=1
 ; RV32-NO-ATOMIC-NEXT:    li a2, 1
 ; RV32-NO-ATOMIC-NEXT:    j .LBB23_1
 ; RV32-NO-ATOMIC-NEXT:  .LBB23_4: # %atomicrmw.end
-; RV32-NO-ATOMIC-NEXT:    mv a0, a1
 ; RV32-NO-ATOMIC-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NO-ATOMIC-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32-NO-ATOMIC-NEXT:    addi sp, sp, 16
@@ -1410,29 +1410,29 @@ define i32 @rmw32_max_seq_cst(ptr %p) nounwind {
 ; RV64-NO-ATOMIC-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64-NO-ATOMIC-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64-NO-ATOMIC-NEXT:    mv s0, a0
-; RV64-NO-ATOMIC-NEXT:    lw a1, 0(a0)
+; RV64-NO-ATOMIC-NEXT:    lw a0, 0(a0)
 ; RV64-NO-ATOMIC-NEXT:    j .LBB23_2
 ; RV64-NO-ATOMIC-NEXT:  .LBB23_1: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # in Loop: Header=BB23_2 Depth=1
-; RV64-NO-ATOMIC-NEXT:    sw a1, 12(sp)
+; RV64-NO-ATOMIC-NEXT:    sw a0, 12(sp)
 ; RV64-NO-ATOMIC-NEXT:    addi a1, sp, 12
 ; RV64-NO-ATOMIC-NEXT:    li a3, 5
 ; RV64-NO-ATOMIC-NEXT:    li a4, 5
 ; RV64-NO-ATOMIC-NEXT:    mv a0, s0
 ; RV64-NO-ATOMIC-NEXT:    call __atomic_compare_exchange_4
-; RV64-NO-ATOMIC-NEXT:    lw a1, 12(sp)
-; RV64-NO-ATOMIC-NEXT:    bnez a0, .LBB23_4
+; RV64-NO-ATOMIC-NEXT:    mv a1, a0
+; RV64-NO-ATOMIC-NEXT:    lw a0, 12(sp)
+; RV64-NO-ATOMIC-NEXT:    bnez a1, .LBB23_4
 ; RV64-NO-ATOMIC-NEXT:  .LBB23_2: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64-NO-ATOMIC-NEXT:    li a0, 1
-; RV64-NO-ATOMIC-NEXT:    mv a2, a1
-; RV64-NO-ATOMIC-NEXT:    blt a0, a1, .LBB23_1
+; RV64-NO-ATOMIC-NEXT:    li a1, 1
+; RV64-NO-ATOMIC-NEXT:    mv a2, a0
+; RV64-NO-ATOMIC-NEXT:    blt a1, a0, .LBB23_1
 ; RV64-NO-ATOMIC-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # in Loop: Header=BB23_2 Depth=1
 ; RV64-NO-ATOMIC-NEXT:    li a2, 1
 ; RV64-NO-ATOMIC-NEXT:    j .LBB23_1
 ; RV64-NO-ATOMIC-NEXT:  .LBB23_4: # %atomicrmw.end
-; RV64-NO-ATOMIC-NEXT:    mv a0, a1
 ; RV64-NO-ATOMIC-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64-NO-ATOMIC-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64-NO-ATOMIC-NEXT:    addi sp, sp, 32
@@ -1469,29 +1469,29 @@ define i32 @rmw32_min_seq_cst(ptr %p) nounwind {
 ; RV32-NO-ATOMIC-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NO-ATOMIC-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32-NO-ATOMIC-NEXT:    mv s0, a0
-; RV32-NO-ATOMIC-NEXT:    lw a1, 0(a0)
+; RV32-NO-ATOMIC-NEXT:    lw a0, 0(a0)
 ; RV32-NO-ATOMIC-NEXT:    li s1, 2
 ; RV32-NO-ATOMIC-NEXT:    j .LBB24_2
 ; RV32-NO-ATOMIC-NEXT:  .LBB24_1: # %atomicrmw.start
 ; RV32-NO-ATOMIC-NEXT:    # in Loop: Header=BB24_2 Depth=1
-; RV32-NO-ATOMIC-NEXT:    sw a1, 0(sp)
+; RV32-NO-ATOMIC-NEXT:    sw a0, 0(sp)
 ; RV32-NO-ATOMIC-NEXT:    mv a1, sp
 ; RV32-NO-ATOMIC-NEXT:    li a3, 5
 ; RV32-NO-ATOMIC-NEXT:    li a4, 5
 ; RV32-NO-ATOMIC-NEXT:    mv a0, s0
 ; RV32-NO-ATOMIC-NEXT:    call __atomic_compare_exchange_4
-; RV32-NO-ATOMIC-NEXT:    lw a1, 0(sp)
-; RV32-NO-ATOMIC-NEXT:    bnez a0, .LBB24_4
+; RV32-NO-ATOMIC-NEXT:    mv a1, a0
+; RV32-NO-ATOMIC-NEXT:    lw a0, 0(sp)
+; RV32-NO-ATOMIC-NEXT:    bnez a1, .LBB24_4
 ; RV32-NO-ATOMIC-NEXT:  .LBB24_2: # %atomicrmw.start
 ; RV32-NO-ATOMIC-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NO-ATOMIC-NEXT:    mv a2, a1
-; RV32-NO-ATOMIC-NEXT:    blt a1, s1, .LBB24_1
+; RV32-NO-ATOMIC-NEXT:    mv a2, a0
+; RV32-NO-ATOMIC-NEXT:    blt a0, s1, .LBB24_1
 ; RV32-NO-ATOMIC-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NO-ATOMIC-NEXT:    # in Loop: Header=BB24_2 Depth=1
 ; RV32-NO-ATOMIC-NEXT:    li a2, 1
 ; RV32-NO-ATOMIC-NEXT:    j .LBB24_1
 ; RV32-NO-ATOMIC-NEXT:  .LBB24_4: # %atomicrmw.end
-; RV32-NO-ATOMIC-NEXT:    mv a0, a1
 ; RV32-NO-ATOMIC-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NO-ATOMIC-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32-NO-ATOMIC-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1525,29 +1525,29 @@ define i32 @rmw32_min_seq_cst(ptr %p) nounwind {
 ; RV64-NO-ATOMIC-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64-NO-ATOMIC-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
 ; RV64-NO-ATOMIC-NEXT:    mv s0, a0
-; RV64-NO-ATOMIC-NEXT:    lw a1, 0(a0)
+; RV64-NO-ATOMIC-NEXT:    lw a0, 0(a0)
 ; RV64-NO-ATOMIC-NEXT:    li s1, 2
 ; RV64-NO-ATOMIC-NEXT:    j .LBB24_2
 ; RV64-NO-ATOMIC-NEXT:  .LBB24_1: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # in Loop: Header=BB24_2 Depth=1
-; RV64-NO-ATOMIC-NEXT:    sw a1, 4(sp)
+; RV64-NO-ATOMIC-NEXT:    sw a0, 4(sp)
 ; RV64-NO-ATOMIC-NEXT:    addi a1, sp, 4
 ; RV64-NO-ATOMIC-NEXT:    li a3, 5
 ; RV64-NO-ATOMIC-NEXT:    li a4, 5
 ; RV64-NO-ATOMIC-NEXT:    mv a0, s0
 ; RV64-NO-ATOMIC-NEXT:    call __atomic_compare_exchange_4
-; RV64-NO-ATOMIC-NEXT:    lw a1, 4(sp)
-; RV64-NO-ATOMIC-NEXT:    bnez a0, .LBB24_4
+; RV64-NO-ATOMIC-NEXT:    mv a1, a0
+; RV64-NO-ATOMIC-NEXT:    lw a0, 4(sp)
+; RV64-NO-ATOMIC-NEXT:    bnez a1, .LBB24_4
 ; RV64-NO-ATOMIC-NEXT:  .LBB24_2: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64-NO-ATOMIC-NEXT:    mv a2, a1
-; RV64-NO-ATOMIC-NEXT:    blt a1, s1, .LBB24_1
+; RV64-NO-ATOMIC-NEXT:    mv a2, a0
+; RV64-NO-ATOMIC-NEXT:    blt a0, s1, .LBB24_1
 ; RV64-NO-ATOMIC-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # in Loop: Header=BB24_2 Depth=1
 ; RV64-NO-ATOMIC-NEXT:    li a2, 1
 ; RV64-NO-ATOMIC-NEXT:    j .LBB24_1
 ; RV64-NO-ATOMIC-NEXT:  .LBB24_4: # %atomicrmw.end
-; RV64-NO-ATOMIC-NEXT:    mv a0, a1
 ; RV64-NO-ATOMIC-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64-NO-ATOMIC-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64-NO-ATOMIC-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -1584,21 +1584,21 @@ define i32 @rmw32_umax_seq_cst(ptr %p) nounwind {
 ; RV32-NO-ATOMIC-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NO-ATOMIC-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NO-ATOMIC-NEXT:    mv s0, a0
-; RV32-NO-ATOMIC-NEXT:    lw a1, 0(a0)
+; RV32-NO-ATOMIC-NEXT:    lw a0, 0(a0)
 ; RV32-NO-ATOMIC-NEXT:  .LBB25_1: # %atomicrmw.start
 ; RV32-NO-ATOMIC-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NO-ATOMIC-NEXT:    seqz a2, a1
-; RV32-NO-ATOMIC-NEXT:    add a2, a1, a2
-; RV32-NO-ATOMIC-NEXT:    sw a1, 4(sp)
+; RV32-NO-ATOMIC-NEXT:    seqz a2, a0
+; RV32-NO-ATOMIC-NEXT:    add a2, a0, a2
+; RV32-NO-ATOMIC-NEXT:    sw a0, 4(sp)
 ; RV32-NO-ATOMIC-NEXT:    addi a1, sp, 4
 ; RV32-NO-ATOMIC-NEXT:    li a3, 5
 ; RV32-NO-ATOMIC-NEXT:    li a4, 5
 ; RV32-NO-ATOMIC-NEXT:    mv a0, s0
 ; RV32-NO-ATOMIC-NEXT:    call __atomic_compare_exchange_4
-; RV32-NO-ATOMIC-NEXT:    lw a1, 4(sp)
-; RV32-NO-ATOMIC-NEXT:    beqz a0, .LBB25_1
+; RV32-NO-ATOMIC-NEXT:    mv a1, a0
+; RV32-NO-ATOMIC-NEXT:    lw a0, 4(sp)
+; RV32-NO-ATOMIC-NEXT:    beqz a1, .LBB25_1
 ; RV32-NO-ATOMIC-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32-NO-ATOMIC-NEXT:    mv a0, a1
 ; RV32-NO-ATOMIC-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NO-ATOMIC-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32-NO-ATOMIC-NEXT:    addi sp, sp, 16
@@ -1630,29 +1630,29 @@ define i32 @rmw32_umax_seq_cst(ptr %p) nounwind {
 ; RV64-NO-ATOMIC-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64-NO-ATOMIC-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64-NO-ATOMIC-NEXT:    mv s0, a0
-; RV64-NO-ATOMIC-NEXT:    lw a1, 0(a0)
+; RV64-NO-ATOMIC-NEXT:    lw a0, 0(a0)
 ; RV64-NO-ATOMIC-NEXT:    j .LBB25_2
 ; RV64-NO-ATOMIC-NEXT:  .LBB25_1: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # in Loop: Header=BB25_2 Depth=1
-; RV64-NO-ATOMIC-NEXT:    sw a1, 12(sp)
+; RV64-NO-ATOMIC-NEXT:    sw a0, 12(sp)
 ; RV64-NO-ATOMIC-NEXT:    addi a1, sp, 12
 ; RV64-NO-ATOMIC-NEXT:    li a3, 5
 ; RV64-NO-ATOMIC-NEXT:    li a4, 5
 ; RV64-NO-ATOMIC-NEXT:    mv a0, s0
 ; RV64-NO-ATOMIC-NEXT:    call __atomic_compare_exchange_4
-; RV64-NO-ATOMIC-NEXT:    lw a1, 12(sp)
-; RV64-NO-ATOMIC-NEXT:    bnez a0, .LBB25_4
+; RV64-NO-ATOMIC-NEXT:    mv a1, a0
+; RV64-NO-ATOMIC-NEXT:    lw a0, 12(sp)
+; RV64-NO-ATOMIC-NEXT:    bnez a1, .LBB25_4
 ; RV64-NO-ATOMIC-NEXT:  .LBB25_2: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64-NO-ATOMIC-NEXT:    li a0, 1
-; RV64-NO-ATOMIC-NEXT:    mv a2, a1
-; RV64-NO-ATOMIC-NEXT:    bltu a0, a1, .LBB25_1
+; RV64-NO-ATOMIC-NEXT:    li a1, 1
+; RV64-NO-ATOMIC-NEXT:    mv a2, a0
+; RV64-NO-ATOMIC-NEXT:    bltu a1, a0, .LBB25_1
 ; RV64-NO-ATOMIC-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # in Loop: Header=BB25_2 Depth=1
 ; RV64-NO-ATOMIC-NEXT:    li a2, 1
 ; RV64-NO-ATOMIC-NEXT:    j .LBB25_1
 ; RV64-NO-ATOMIC-NEXT:  .LBB25_4: # %atomicrmw.end
-; RV64-NO-ATOMIC-NEXT:    mv a0, a1
 ; RV64-NO-ATOMIC-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64-NO-ATOMIC-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64-NO-ATOMIC-NEXT:    addi sp, sp, 32
@@ -1689,29 +1689,29 @@ define i32 @rmw32_umin_seq_cst(ptr %p) nounwind {
 ; RV32-NO-ATOMIC-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NO-ATOMIC-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32-NO-ATOMIC-NEXT:    mv s0, a0
-; RV32-NO-ATOMIC-NEXT:    lw a1, 0(a0)
+; RV32-NO-ATOMIC-NEXT:    lw a0, 0(a0)
 ; RV32-NO-ATOMIC-NEXT:    li s1, 2
 ; RV32-NO-ATOMIC-NEXT:    j .LBB26_2
 ; RV32-NO-ATOMIC-NEXT:  .LBB26_1: # %atomicrmw.start
 ; RV32-NO-ATOMIC-NEXT:    # in Loop: Header=BB26_2 Depth=1
-; RV32-NO-ATOMIC-NEXT:    sw a1, 0(sp)
+; RV32-NO-ATOMIC-NEXT:    sw a0, 0(sp)
 ; RV32-NO-ATOMIC-NEXT:    mv a1, sp
 ; RV32-NO-ATOMIC-NEXT:    li a3, 5
 ; RV32-NO-ATOMIC-NEXT:    li a4, 5
 ; RV32-NO-ATOMIC-NEXT:    mv a0, s0
 ; RV32-NO-ATOMIC-NEXT:    call __atomic_compare_exchange_4
-; RV32-NO-ATOMIC-NEXT:    lw a1, 0(sp)
-; RV32-NO-ATOMIC-NEXT:    bnez a0, .LBB26_4
+; RV32-NO-ATOMIC-NEXT:    mv a1, a0
+; RV32-NO-ATOMIC-NEXT:    lw a0, 0(sp)
+; RV32-NO-ATOMIC-NEXT:    bnez a1, .LBB26_4
 ; RV32-NO-ATOMIC-NEXT:  .LBB26_2: # %atomicrmw.start
 ; RV32-NO-ATOMIC-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NO-ATOMIC-NEXT:    mv a2, a1
-; RV32-NO-ATOMIC-NEXT:    bltu a1, s1, .LBB26_1
+; RV32-NO-ATOMIC-NEXT:    mv a2, a0
+; RV32-NO-ATOMIC-NEXT:    bltu a0, s1, .LBB26_1
 ; RV32-NO-ATOMIC-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NO-ATOMIC-NEXT:    # in Loop: Header=BB26_2 Depth=1
 ; RV32-NO-ATOMIC-NEXT:    li a2, 1
 ; RV32-NO-ATOMIC-NEXT:    j .LBB26_1
 ; RV32-NO-ATOMIC-NEXT:  .LBB26_4: # %atomicrmw.end
-; RV32-NO-ATOMIC-NEXT:    mv a0, a1
 ; RV32-NO-ATOMIC-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NO-ATOMIC-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32-NO-ATOMIC-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1745,29 +1745,29 @@ define i32 @rmw32_umin_seq_cst(ptr %p) nounwind {
 ; RV64-NO-ATOMIC-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64-NO-ATOMIC-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
 ; RV64-NO-ATOMIC-NEXT:    mv s0, a0
-; RV64-NO-ATOMIC-NEXT:    lw a1, 0(a0)
+; RV64-NO-ATOMIC-NEXT:    lw a0, 0(a0)
 ; RV64-NO-ATOMIC-NEXT:    li s1, 2
 ; RV64-NO-ATOMIC-NEXT:    j .LBB26_2
 ; RV64-NO-ATOMIC-NEXT:  .LBB26_1: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # in Loop: Header=BB26_2 Depth=1
-; RV64-NO-ATOMIC-NEXT:    sw a1, 4(sp)
+; RV64-NO-ATOMIC-NEXT:    sw a0, 4(sp)
 ; RV64-NO-ATOMIC-NEXT:    addi a1, sp, 4
 ; RV64-NO-ATOMIC-NEXT:    li a3, 5
 ; RV64-NO-ATOMIC-NEXT:    li a4, 5
 ; RV64-NO-ATOMIC-NEXT:    mv a0, s0
 ; RV64-NO-ATOMIC-NEXT:    call __atomic_compare_exchange_4
-; RV64-NO-ATOMIC-NEXT:    lw a1, 4(sp)
-; RV64-NO-ATOMIC-NEXT:    bnez a0, .LBB26_4
+; RV64-NO-ATOMIC-NEXT:    mv a1, a0
+; RV64-NO-ATOMIC-NEXT:    lw a0, 4(sp)
+; RV64-NO-ATOMIC-NEXT:    bnez a1, .LBB26_4
 ; RV64-NO-ATOMIC-NEXT:  .LBB26_2: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64-NO-ATOMIC-NEXT:    mv a2, a1
-; RV64-NO-ATOMIC-NEXT:    bltu a1, s1, .LBB26_1
+; RV64-NO-ATOMIC-NEXT:    mv a2, a0
+; RV64-NO-ATOMIC-NEXT:    bltu a0, s1, .LBB26_1
 ; RV64-NO-ATOMIC-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # in Loop: Header=BB26_2 Depth=1
 ; RV64-NO-ATOMIC-NEXT:    li a2, 1
 ; RV64-NO-ATOMIC-NEXT:    j .LBB26_1
 ; RV64-NO-ATOMIC-NEXT:  .LBB26_4: # %atomicrmw.end
-; RV64-NO-ATOMIC-NEXT:    mv a0, a1
 ; RV64-NO-ATOMIC-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64-NO-ATOMIC-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64-NO-ATOMIC-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -3348,43 +3348,43 @@ define i64 @rmw64_max_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a4, 0(a0)
-; RV32-NEXT:    lw a1, 4(a0)
+; RV32-NEXT:    lw a0, 0(a0)
+; RV32-NEXT:    lw a1, 4(s0)
 ; RV32-NEXT:    j .LBB49_2
 ; RV32-NEXT:  .LBB49_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB49_2 Depth=1
-; RV32-NEXT:    neg a3, a0
+; RV32-NEXT:    neg a3, a3
 ; RV32-NEXT:    and a3, a3, a1
-; RV32-NEXT:    sw a4, 0(sp)
+; RV32-NEXT:    sw a0, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a0, s0
 ; RV32-NEXT:    call __atomic_compare_exchange_8
-; RV32-NEXT:    lw a4, 0(sp)
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    lw a0, 0(sp)
 ; RV32-NEXT:    lw a1, 4(sp)
-; RV32-NEXT:    bnez a0, .LBB49_6
+; RV32-NEXT:    bnez a2, .LBB49_6
 ; RV32-NEXT:  .LBB49_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    beqz a1, .LBB49_4
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB49_2 Depth=1
-; RV32-NEXT:    sgtz a0, a1
-; RV32-NEXT:    mv a2, a4
-; RV32-NEXT:    bnez a0, .LBB49_1
+; RV32-NEXT:    sgtz a3, a1
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    bnez a3, .LBB49_1
 ; RV32-NEXT:    j .LBB49_5
 ; RV32-NEXT:  .LBB49_4: # in Loop: Header=BB49_2 Depth=1
-; RV32-NEXT:    sltiu a0, a4, 2
-; RV32-NEXT:    xori a0, a0, 1
-; RV32-NEXT:    mv a2, a4
-; RV32-NEXT:    bnez a0, .LBB49_1
+; RV32-NEXT:    sltiu a2, a0, 2
+; RV32-NEXT:    xori a3, a2, 1
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    bnez a3, .LBB49_1
 ; RV32-NEXT:  .LBB49_5: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB49_2 Depth=1
 ; RV32-NEXT:    li a2, 1
 ; RV32-NEXT:    j .LBB49_1
 ; RV32-NEXT:  .LBB49_6: # %atomicrmw.end
-; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3396,28 +3396,28 @@ define i64 @rmw64_max_seq_cst(ptr %p) nounwind {
 ; RV64-NO-ATOMIC-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64-NO-ATOMIC-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64-NO-ATOMIC-NEXT:    mv s0, a0
-; RV64-NO-ATOMIC-NEXT:    ld a1, 0(a0)
+; RV64-NO-ATOMIC-NEXT:    ld a0, 0(a0)
 ; RV64-NO-ATOMIC-NEXT:    j .LBB49_2
 ; RV64-NO-ATOMIC-NEXT:  .LBB49_1: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # in Loop: Header=BB49_2 Depth=1
-; RV64-NO-ATOMIC-NEXT:    sd a1, 8(sp)
+; RV64-NO-ATOMIC-NEXT:    sd a0, 8(sp)
 ; RV64-NO-ATOMIC-NEXT:    addi a1, sp, 8
 ; RV64-NO-ATOMIC-NEXT:    li a3, 5
 ; RV64-NO-ATOMIC-NEXT:    li a4, 5
 ; RV64-NO-ATOMIC-NEXT:    mv a0, s0
 ; RV64-NO-ATOMIC-NEXT:    call __atomic_compare_exchange_8
-; RV64-NO-ATOMIC-NEXT:    ld a1, 8(sp)
-; RV64-NO-ATOMIC-NEXT:    bnez a0, .LBB49_4
+; RV64-NO-ATOMIC-NEXT:    mv a1, a0
+; RV64-NO-ATOMIC-NEXT:    ld a0, 8(sp)
+; RV64-NO-ATOMIC-NEXT:    bnez a1, .LBB49_4
 ; RV64-NO-ATOMIC-NEXT:  .LBB49_2: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64-NO-ATOMIC-NEXT:    mv a2, a1
-; RV64-NO-ATOMIC-NEXT:    bgtz a1, .LBB49_1
+; RV64-NO-ATOMIC-NEXT:    mv a2, a0
+; RV64-NO-ATOMIC-NEXT:    bgtz a0, .LBB49_1
 ; RV64-NO-ATOMIC-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # in Loop: Header=BB49_2 Depth=1
 ; RV64-NO-ATOMIC-NEXT:    li a2, 1
 ; RV64-NO-ATOMIC-NEXT:    j .LBB49_1
 ; RV64-NO-ATOMIC-NEXT:  .LBB49_4: # %atomicrmw.end
-; RV64-NO-ATOMIC-NEXT:    mv a0, a1
 ; RV64-NO-ATOMIC-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64-NO-ATOMIC-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64-NO-ATOMIC-NEXT:    addi sp, sp, 32
@@ -3453,42 +3453,42 @@ define i64 @rmw64_min_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a4, 0(a0)
-; RV32-NEXT:    lw a1, 4(a0)
+; RV32-NEXT:    lw a0, 0(a0)
+; RV32-NEXT:    lw a1, 4(s0)
 ; RV32-NEXT:    j .LBB50_2
 ; RV32-NEXT:  .LBB50_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB50_2 Depth=1
-; RV32-NEXT:    neg a3, a0
+; RV32-NEXT:    neg a3, a3
 ; RV32-NEXT:    and a3, a3, a1
-; RV32-NEXT:    sw a4, 0(sp)
+; RV32-NEXT:    sw a0, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a0, s0
 ; RV32-NEXT:    call __atomic_compare_exchange_8
-; RV32-NEXT:    lw a4, 0(sp)
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    lw a0, 0(sp)
 ; RV32-NEXT:    lw a1, 4(sp)
-; RV32-NEXT:    bnez a0, .LBB50_6
+; RV32-NEXT:    bnez a2, .LBB50_6
 ; RV32-NEXT:  .LBB50_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    beqz a1, .LBB50_4
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB50_2 Depth=1
-; RV32-NEXT:    slti a0, a1, 0
-; RV32-NEXT:    mv a2, a4
-; RV32-NEXT:    bnez a0, .LBB50_1
+; RV32-NEXT:    slti a3, a1, 0
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    bnez a3, .LBB50_1
 ; RV32-NEXT:    j .LBB50_5
 ; RV32-NEXT:  .LBB50_4: # in Loop: Header=BB50_2 Depth=1
-; RV32-NEXT:    sltiu a0, a4, 2
-; RV32-NEXT:    mv a2, a4
-; RV32-NEXT:    bnez a0, .LBB50_1
+; RV32-NEXT:    sltiu a3, a0, 2
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    bnez a3, .LBB50_1
 ; RV32-NEXT:  .LBB50_5: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB50_2 Depth=1
 ; RV32-NEXT:    li a2, 1
 ; RV32-NEXT:    j .LBB50_1
 ; RV32-NEXT:  .LBB50_6: # %atomicrmw.end
-; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3501,29 +3501,29 @@ define i64 @rmw64_min_seq_cst(ptr %p) nounwind {
 ; RV64-NO-ATOMIC-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64-NO-ATOMIC-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
 ; RV64-NO-ATOMIC-NEXT:    mv s0, a0
-; RV64-NO-ATOMIC-NEXT:    ld a1, 0(a0)
+; RV64-NO-ATOMIC-NEXT:    ld a0, 0(a0)
 ; RV64-NO-ATOMIC-NEXT:    li s1, 2
 ; RV64-NO-ATOMIC-NEXT:    j .LBB50_2
 ; RV64-NO-ATOMIC-NEXT:  .LBB50_1: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # in Loop: Header=BB50_2 Depth=1
-; RV64-NO-ATOMIC-NEXT:    sd a1, 0(sp)
+; RV64-NO-ATOMIC-NEXT:    sd a0, 0(sp)
 ; RV64-NO-ATOMIC-NEXT:    mv a1, sp
 ; RV64-NO-ATOMIC-NEXT:    li a3, 5
 ; RV64-NO-ATOMIC-NEXT:    li a4, 5
 ; RV64-NO-ATOMIC-NEXT:    mv a0, s0
 ; RV64-NO-ATOMIC-NEXT:    call __atomic_compare_exchange_8
-; RV64-NO-ATOMIC-NEXT:    ld a1, 0(sp)
-; RV64-NO-ATOMIC-NEXT:    bnez a0, .LBB50_4
+; RV64-NO-ATOMIC-NEXT:    mv a1, a0
+; RV64-NO-ATOMIC-NEXT:    ld a0, 0(sp)
+; RV64-NO-ATOMIC-NEXT:    bnez a1, .LBB50_4
 ; RV64-NO-ATOMIC-NEXT:  .LBB50_2: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64-NO-ATOMIC-NEXT:    mv a2, a1
-; RV64-NO-ATOMIC-NEXT:    blt a1, s1, .LBB50_1
+; RV64-NO-ATOMIC-NEXT:    mv a2, a0
+; RV64-NO-ATOMIC-NEXT:    blt a0, s1, .LBB50_1
 ; RV64-NO-ATOMIC-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # in Loop: Header=BB50_2 Depth=1
 ; RV64-NO-ATOMIC-NEXT:    li a2, 1
 ; RV64-NO-ATOMIC-NEXT:    j .LBB50_1
 ; RV64-NO-ATOMIC-NEXT:  .LBB50_4: # %atomicrmw.end
-; RV64-NO-ATOMIC-NEXT:    mv a0, a1
 ; RV64-NO-ATOMIC-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64-NO-ATOMIC-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64-NO-ATOMIC-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -3560,37 +3560,37 @@ define i64 @rmw64_umax_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a4, 0(a0)
-; RV32-NEXT:    lw a1, 4(a0)
+; RV32-NEXT:    lw a0, 0(a0)
+; RV32-NEXT:    lw a1, 4(s0)
 ; RV32-NEXT:    j .LBB51_2
 ; RV32-NEXT:  .LBB51_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB51_2 Depth=1
-; RV32-NEXT:    neg a3, a0
+; RV32-NEXT:    neg a3, a3
 ; RV32-NEXT:    and a3, a3, a1
-; RV32-NEXT:    sw a4, 0(sp)
+; RV32-NEXT:    sw a0, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a0, s0
 ; RV32-NEXT:    call __atomic_compare_exchange_8
-; RV32-NEXT:    lw a4, 0(sp)
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    lw a0, 0(sp)
 ; RV32-NEXT:    lw a1, 4(sp)
-; RV32-NEXT:    bnez a0, .LBB51_4
+; RV32-NEXT:    bnez a2, .LBB51_4
 ; RV32-NEXT:  .LBB51_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    snez a0, a1
-; RV32-NEXT:    sltiu a2, a4, 2
-; RV32-NEXT:    xori a2, a2, 1
-; RV32-NEXT:    or a0, a2, a0
-; RV32-NEXT:    mv a2, a4
-; RV32-NEXT:    bnez a0, .LBB51_1
+; RV32-NEXT:    snez a2, a1
+; RV32-NEXT:    sltiu a3, a0, 2
+; RV32-NEXT:    xori a3, a3, 1
+; RV32-NEXT:    or a3, a3, a2
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    bnez a3, .LBB51_1
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB51_2 Depth=1
 ; RV32-NEXT:    li a2, 1
 ; RV32-NEXT:    j .LBB51_1
 ; RV32-NEXT:  .LBB51_4: # %atomicrmw.end
-; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3602,21 +3602,21 @@ define i64 @rmw64_umax_seq_cst(ptr %p) nounwind {
 ; RV64-NO-ATOMIC-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64-NO-ATOMIC-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64-NO-ATOMIC-NEXT:    mv s0, a0
-; RV64-NO-ATOMIC-NEXT:    ld a1, 0(a0)
+; RV64-NO-ATOMIC-NEXT:    ld a0, 0(a0)
 ; RV64-NO-ATOMIC-NEXT:  .LBB51_1: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64-NO-ATOMIC-NEXT:    seqz a2, a1
-; RV64-NO-ATOMIC-NEXT:    add a2, a1, a2
-; RV64-NO-ATOMIC-NEXT:    sd a1, 8(sp)
+; RV64-NO-ATOMIC-NEXT:    seqz a2, a0
+; RV64-NO-ATOMIC-NEXT:    add a2, a0, a2
+; RV64-NO-ATOMIC-NEXT:    sd a0, 8(sp)
 ; RV64-NO-ATOMIC-NEXT:    addi a1, sp, 8
 ; RV64-NO-ATOMIC-NEXT:    li a3, 5
 ; RV64-NO-ATOMIC-NEXT:    li a4, 5
 ; RV64-NO-ATOMIC-NEXT:    mv a0, s0
 ; RV64-NO-ATOMIC-NEXT:    call __atomic_compare_exchange_8
-; RV64-NO-ATOMIC-NEXT:    ld a1, 8(sp)
-; RV64-NO-ATOMIC-NEXT:    beqz a0, .LBB51_1
+; RV64-NO-ATOMIC-NEXT:    mv a1, a0
+; RV64-NO-ATOMIC-NEXT:    ld a0, 8(sp)
+; RV64-NO-ATOMIC-NEXT:    beqz a1, .LBB51_1
 ; RV64-NO-ATOMIC-NEXT:  # %bb.2: # %atomicrmw.end
-; RV64-NO-ATOMIC-NEXT:    mv a0, a1
 ; RV64-NO-ATOMIC-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64-NO-ATOMIC-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64-NO-ATOMIC-NEXT:    addi sp, sp, 32
@@ -3652,36 +3652,36 @@ define i64 @rmw64_umin_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a4, 0(a0)
-; RV32-NEXT:    lw a1, 4(a0)
+; RV32-NEXT:    lw a0, 0(a0)
+; RV32-NEXT:    lw a1, 4(s0)
 ; RV32-NEXT:    j .LBB52_2
 ; RV32-NEXT:  .LBB52_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB52_2 Depth=1
-; RV32-NEXT:    neg a3, a0
+; RV32-NEXT:    neg a3, a3
 ; RV32-NEXT:    and a3, a3, a1
-; RV32-NEXT:    sw a4, 0(sp)
+; RV32-NEXT:    sw a0, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a0, s0
 ; RV32-NEXT:    call __atomic_compare_exchange_8
-; RV32-NEXT:    lw a4, 0(sp)
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    lw a0, 0(sp)
 ; RV32-NEXT:    lw a1, 4(sp)
-; RV32-NEXT:    bnez a0, .LBB52_4
+; RV32-NEXT:    bnez a2, .LBB52_4
 ; RV32-NEXT:  .LBB52_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    sltiu a0, a4, 2
-; RV32-NEXT:    seqz a2, a1
-; RV32-NEXT:    and a0, a2, a0
-; RV32-NEXT:    mv a2, a4
-; RV32-NEXT:    bnez a0, .LBB52_1
+; RV32-NEXT:    sltiu a2, a0, 2
+; RV32-NEXT:    seqz a3, a1
+; RV32-NEXT:    and a3, a3, a2
+; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    bnez a3, .LBB52_1
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB52_2 Depth=1
 ; RV32-NEXT:    li a2, 1
 ; RV32-NEXT:    j .LBB52_1
 ; RV32-NEXT:  .LBB52_4: # %atomicrmw.end
-; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3694,29 +3694,29 @@ define i64 @rmw64_umin_seq_cst(ptr %p) nounwind {
 ; RV64-NO-ATOMIC-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64-NO-ATOMIC-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
 ; RV64-NO-ATOMIC-NEXT:    mv s0, a0
-; RV64-NO-ATOMIC-NEXT:    ld a1, 0(a0)
+; RV64-NO-ATOMIC-NEXT:    ld a0, 0(a0)
 ; RV64-NO-ATOMIC-NEXT:    li s1, 2
 ; RV64-NO-ATOMIC-NEXT:    j .LBB52_2
 ; RV64-NO-ATOMIC-NEXT:  .LBB52_1: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # in Loop: Header=BB52_2 Depth=1
-; RV64-NO-ATOMIC-NEXT:    sd a1, 0(sp)
+; RV64-NO-ATOMIC-NEXT:    sd a0, 0(sp)
 ; RV64-NO-ATOMIC-NEXT:    mv a1, sp
 ; RV64-NO-ATOMIC-NEXT:    li a3, 5
 ; RV64-NO-ATOMIC-NEXT:    li a4, 5
 ; RV64-NO-ATOMIC-NEXT:    mv a0, s0
 ; RV64-NO-ATOMIC-NEXT:    call __atomic_compare_exchange_8
-; RV64-NO-ATOMIC-NEXT:    ld a1, 0(sp)
-; RV64-NO-ATOMIC-NEXT:    bnez a0, .LBB52_4
+; RV64-NO-ATOMIC-NEXT:    mv a1, a0
+; RV64-NO-ATOMIC-NEXT:    ld a0, 0(sp)
+; RV64-NO-ATOMIC-NEXT:    bnez a1, .LBB52_4
 ; RV64-NO-ATOMIC-NEXT:  .LBB52_2: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64-NO-ATOMIC-NEXT:    mv a2, a1
-; RV64-NO-ATOMIC-NEXT:    bltu a1, s1, .LBB52_1
+; RV64-NO-ATOMIC-NEXT:    mv a2, a0
+; RV64-NO-ATOMIC-NEXT:    bltu a0, s1, .LBB52_1
 ; RV64-NO-ATOMIC-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV64-NO-ATOMIC-NEXT:    # in Loop: Header=BB52_2 Depth=1
 ; RV64-NO-ATOMIC-NEXT:    li a2, 1
 ; RV64-NO-ATOMIC-NEXT:    j .LBB52_1
 ; RV64-NO-ATOMIC-NEXT:  .LBB52_4: # %atomicrmw.end
-; RV64-NO-ATOMIC-NEXT:    mv a0, a1
 ; RV64-NO-ATOMIC-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64-NO-ATOMIC-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64-NO-ATOMIC-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -4530,12 +4530,12 @@ define i128 @rmw128(ptr %p) nounwind {
 ; RV32-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    mv s0, a1
+; RV32-NEXT:    mv s1, a1
+; RV32-NEXT:    mv s0, a0
 ; RV32-NEXT:    lw a4, 0(a1)
 ; RV32-NEXT:    lw a3, 4(a1)
 ; RV32-NEXT:    lw a1, 8(a1)
-; RV32-NEXT:    lw a2, 12(s0)
-; RV32-NEXT:    mv s1, a0
+; RV32-NEXT:    lw a2, 12(s1)
 ; RV32-NEXT:  .LBB62_1: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    addi a0, a4, 1
@@ -4559,7 +4559,7 @@ define i128 @rmw128(ptr %p) nounwind {
 ; RV32-NEXT:    mv a3, sp
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
-; RV32-NEXT:    mv a1, s0
+; RV32-NEXT:    mv a1, s1
 ; RV32-NEXT:    call __atomic_compare_exchange
 ; RV32-NEXT:    lw a4, 16(sp)
 ; RV32-NEXT:    lw a3, 20(sp)
@@ -4567,10 +4567,10 @@ define i128 @rmw128(ptr %p) nounwind {
 ; RV32-NEXT:    lw a2, 28(sp)
 ; RV32-NEXT:    beqz a0, .LBB62_1
 ; RV32-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32-NEXT:    sw a4, 0(s1)
-; RV32-NEXT:    sw a3, 4(s1)
-; RV32-NEXT:    sw a1, 8(s1)
-; RV32-NEXT:    sw a2, 12(s1)
+; RV32-NEXT:    sw a4, 0(s0)
+; RV32-NEXT:    sw a3, 4(s0)
+; RV32-NEXT:    sw a1, 8(s0)
+; RV32-NEXT:    sw a2, 12(s0)
 ; RV32-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/fp-fcanonicalize.ll b/llvm/test/CodeGen/RISCV/fp-fcanonicalize.ll
index e9b771a0698de..7da9bbbb079e9 100644
--- a/llvm/test/CodeGen/RISCV/fp-fcanonicalize.ll
+++ b/llvm/test/CodeGen/RISCV/fp-fcanonicalize.ll
@@ -306,12 +306,12 @@ define <4 x half> @fcanonicalize_v4f16(<4 x half> %x) {
 ; CHECK-NOFP16-RV64-NEXT:    .cfi_offset fs0, -48
 ; CHECK-NOFP16-RV64-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOFP16-RV64-NEXT:    .cfi_offset fs2, -64
+; CHECK-NOFP16-RV64-NEXT:    mv s0, a0
 ; CHECK-NOFP16-RV64-NEXT:    lhu s1, 0(a1)
 ; CHECK-NOFP16-RV64-NEXT:    lhu s2, 8(a1)
 ; CHECK-NOFP16-RV64-NEXT:    lhu s3, 16(a1)
-; CHECK-NOFP16-RV64-NEXT:    lhu a1, 24(a1)
-; CHECK-NOFP16-RV64-NEXT:    mv s0, a0
-; CHECK-NOFP16-RV64-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOFP16-RV64-NEXT:    lhu a0, 24(a1)
+; CHECK-NOFP16-RV64-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOFP16-RV64-NEXT:    call __extendhfsf2
 ; CHECK-NOFP16-RV64-NEXT:    fmv.s fs0, fa0
 ; CHECK-NOFP16-RV64-NEXT:    fmv.w.x fa0, s3
@@ -330,8 +330,8 @@ define <4 x half> @fcanonicalize_v4f16(<4 x half> %x) {
 ; CHECK-NOFP16-RV64-NEXT:    call __truncsfhf2
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w s1, fa0
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w s2, fs2
-; CHECK-NOFP16-RV64-NEXT:    fmin.s fa0, fs0, fs0
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w s3, fs1
+; CHECK-NOFP16-RV64-NEXT:    fmin.s fa0, fs0, fs0
 ; CHECK-NOFP16-RV64-NEXT:    call __truncsfhf2
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w a0, fa0
 ; CHECK-NOFP16-RV64-NEXT:    sh s1, 0(s0)
@@ -419,8 +419,8 @@ define <4 x half> @fcanonicalize_v4f16(<4 x half> %x) {
 ; CHECK-NOFP16-RV32-NEXT:    call __truncsfhf2
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w s1, fa0
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w s2, fs1
-; CHECK-NOFP16-RV32-NEXT:    fmin.s fa0, fs3, fs3
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w s3, fs2
+; CHECK-NOFP16-RV32-NEXT:    fmin.s fa0, fs3, fs3
 ; CHECK-NOFP16-RV32-NEXT:    call __truncsfhf2
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w a0, fa0
 ; CHECK-NOFP16-RV32-NEXT:    sh s1, 0(s0)
@@ -485,12 +485,12 @@ define <4 x half> @fcanonicalize_v4f16_nnan(<4 x half> %x) {
 ; CHECK-NOFP16-RV64-NEXT:    .cfi_offset fs0, -48
 ; CHECK-NOFP16-RV64-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOFP16-RV64-NEXT:    .cfi_offset fs2, -64
+; CHECK-NOFP16-RV64-NEXT:    mv s0, a0
 ; CHECK-NOFP16-RV64-NEXT:    lhu s1, 0(a1)
 ; CHECK-NOFP16-RV64-NEXT:    lhu s2, 8(a1)
 ; CHECK-NOFP16-RV64-NEXT:    lhu s3, 16(a1)
-; CHECK-NOFP16-RV64-NEXT:    lhu a1, 24(a1)
-; CHECK-NOFP16-RV64-NEXT:    mv s0, a0
-; CHECK-NOFP16-RV64-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOFP16-RV64-NEXT:    lhu a0, 24(a1)
+; CHECK-NOFP16-RV64-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOFP16-RV64-NEXT:    call __extendhfsf2
 ; CHECK-NOFP16-RV64-NEXT:    fmv.s fs0, fa0
 ; CHECK-NOFP16-RV64-NEXT:    fmv.w.x fa0, s3
@@ -509,8 +509,8 @@ define <4 x half> @fcanonicalize_v4f16_nnan(<4 x half> %x) {
 ; CHECK-NOFP16-RV64-NEXT:    call __truncsfhf2
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w s1, fa0
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w s2, fs2
-; CHECK-NOFP16-RV64-NEXT:    fmin.s fa0, fs0, fs0
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w s3, fs1
+; CHECK-NOFP16-RV64-NEXT:    fmin.s fa0, fs0, fs0
 ; CHECK-NOFP16-RV64-NEXT:    call __truncsfhf2
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w a0, fa0
 ; CHECK-NOFP16-RV64-NEXT:    sh s1, 0(s0)
@@ -598,8 +598,8 @@ define <4 x half> @fcanonicalize_v4f16_nnan(<4 x half> %x) {
 ; CHECK-NOFP16-RV32-NEXT:    call __truncsfhf2
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w s1, fa0
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w s2, fs1
-; CHECK-NOFP16-RV32-NEXT:    fmin.s fa0, fs3, fs3
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w s3, fs2
+; CHECK-NOFP16-RV32-NEXT:    fmin.s fa0, fs3, fs3
 ; CHECK-NOFP16-RV32-NEXT:    call __truncsfhf2
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w a0, fa0
 ; CHECK-NOFP16-RV32-NEXT:    sh s1, 0(s0)
@@ -688,6 +688,7 @@ define <8 x half> @fcanonicalize_v8f16(<8 x half> %x) {
 ; CHECK-NOFP16-RV64-NEXT:    .cfi_offset fs4, -112
 ; CHECK-NOFP16-RV64-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOFP16-RV64-NEXT:    .cfi_offset fs6, -128
+; CHECK-NOFP16-RV64-NEXT:    mv s0, a0
 ; CHECK-NOFP16-RV64-NEXT:    lhu s1, 0(a1)
 ; CHECK-NOFP16-RV64-NEXT:    lhu s2, 8(a1)
 ; CHECK-NOFP16-RV64-NEXT:    lhu s3, 16(a1)
@@ -695,9 +696,8 @@ define <8 x half> @fcanonicalize_v8f16(<8 x half> %x) {
 ; CHECK-NOFP16-RV64-NEXT:    lhu s5, 32(a1)
 ; CHECK-NOFP16-RV64-NEXT:    lhu s6, 40(a1)
 ; CHECK-NOFP16-RV64-NEXT:    lhu s7, 48(a1)
-; CHECK-NOFP16-RV64-NEXT:    lhu a1, 56(a1)
-; CHECK-NOFP16-RV64-NEXT:    mv s0, a0
-; CHECK-NOFP16-RV64-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOFP16-RV64-NEXT:    lhu a0, 56(a1)
+; CHECK-NOFP16-RV64-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOFP16-RV64-NEXT:    call __extendhfsf2
 ; CHECK-NOFP16-RV64-NEXT:    fmv.s fs0, fa0
 ; CHECK-NOFP16-RV64-NEXT:    fmv.w.x fa0, s7
@@ -740,8 +740,8 @@ define <8 x half> @fcanonicalize_v8f16(<8 x half> %x) {
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w s4, fs4
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w s5, fs3
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w s6, fs2
-; CHECK-NOFP16-RV64-NEXT:    fmin.s fa0, fs0, fs0
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w s7, fs1
+; CHECK-NOFP16-RV64-NEXT:    fmin.s fa0, fs0, fs0
 ; CHECK-NOFP16-RV64-NEXT:    call __truncsfhf2
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w a0, fa0
 ; CHECK-NOFP16-RV64-NEXT:    sh s5, 8(s0)
@@ -905,8 +905,8 @@ define <8 x half> @fcanonicalize_v8f16(<8 x half> %x) {
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w s4, fs5
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w s5, fs6
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w s6, fs7
-; CHECK-NOFP16-RV32-NEXT:    fmin.s fa0, fs1, fs1
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w s7, fs3
+; CHECK-NOFP16-RV32-NEXT:    fmin.s fa0, fs1, fs1
 ; CHECK-NOFP16-RV32-NEXT:    call __truncsfhf2
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w a0, fa0
 ; CHECK-NOFP16-RV32-NEXT:    sh s5, 8(s0)
@@ -1015,6 +1015,7 @@ define <8 x half> @fcanonicalize_v8f16_nnan(<8 x half> %x) {
 ; CHECK-NOFP16-RV64-NEXT:    .cfi_offset fs4, -112
 ; CHECK-NOFP16-RV64-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOFP16-RV64-NEXT:    .cfi_offset fs6, -128
+; CHECK-NOFP16-RV64-NEXT:    mv s0, a0
 ; CHECK-NOFP16-RV64-NEXT:    lhu s1, 0(a1)
 ; CHECK-NOFP16-RV64-NEXT:    lhu s2, 8(a1)
 ; CHECK-NOFP16-RV64-NEXT:    lhu s3, 16(a1)
@@ -1022,9 +1023,8 @@ define <8 x half> @fcanonicalize_v8f16_nnan(<8 x half> %x) {
 ; CHECK-NOFP16-RV64-NEXT:    lhu s5, 32(a1)
 ; CHECK-NOFP16-RV64-NEXT:    lhu s6, 40(a1)
 ; CHECK-NOFP16-RV64-NEXT:    lhu s7, 48(a1)
-; CHECK-NOFP16-RV64-NEXT:    lhu a1, 56(a1)
-; CHECK-NOFP16-RV64-NEXT:    mv s0, a0
-; CHECK-NOFP16-RV64-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOFP16-RV64-NEXT:    lhu a0, 56(a1)
+; CHECK-NOFP16-RV64-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOFP16-RV64-NEXT:    call __extendhfsf2
 ; CHECK-NOFP16-RV64-NEXT:    fmv.s fs0, fa0
 ; CHECK-NOFP16-RV64-NEXT:    fmv.w.x fa0, s7
@@ -1067,8 +1067,8 @@ define <8 x half> @fcanonicalize_v8f16_nnan(<8 x half> %x) {
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w s4, fs4
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w s5, fs3
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w s6, fs2
-; CHECK-NOFP16-RV64-NEXT:    fmin.s fa0, fs0, fs0
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w s7, fs1
+; CHECK-NOFP16-RV64-NEXT:    fmin.s fa0, fs0, fs0
 ; CHECK-NOFP16-RV64-NEXT:    call __truncsfhf2
 ; CHECK-NOFP16-RV64-NEXT:    fmv.x.w a0, fa0
 ; CHECK-NOFP16-RV64-NEXT:    sh s5, 8(s0)
@@ -1232,8 +1232,8 @@ define <8 x half> @fcanonicalize_v8f16_nnan(<8 x half> %x) {
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w s4, fs5
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w s5, fs6
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w s6, fs7
-; CHECK-NOFP16-RV32-NEXT:    fmin.s fa0, fs1, fs1
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w s7, fs3
+; CHECK-NOFP16-RV32-NEXT:    fmin.s fa0, fs1, fs1
 ; CHECK-NOFP16-RV32-NEXT:    call __truncsfhf2
 ; CHECK-NOFP16-RV32-NEXT:    fmv.x.w a0, fa0
 ; CHECK-NOFP16-RV32-NEXT:    sh s5, 8(s0)
diff --git a/llvm/test/CodeGen/RISCV/fp128.ll b/llvm/test/CodeGen/RISCV/fp128.ll
index a8e26f7686e50..443bd22c58a21 100644
--- a/llvm/test/CodeGen/RISCV/fp128.ll
+++ b/llvm/test/CodeGen/RISCV/fp128.ll
@@ -18,21 +18,21 @@ define i32 @test_load_and_cmp() nounwind {
 ; RV32I-NEXT:    lw a2, %lo(x)(a0)
 ; RV32I-NEXT:    lw a3, %lo(x+4)(a0)
 ; RV32I-NEXT:    lw a4, %lo(x+8)(a0)
-; RV32I-NEXT:    lw a5, %lo(x+12)(a0)
-; RV32I-NEXT:    lw a0, %lo(y)(a1)
+; RV32I-NEXT:    lw a0, %lo(x+12)(a0)
+; RV32I-NEXT:    lw a5, %lo(y)(a1)
 ; RV32I-NEXT:    lw a6, %lo(y+4)(a1)
 ; RV32I-NEXT:    lw a7, %lo(y+8)(a1)
 ; RV32I-NEXT:    lw a1, %lo(y+12)(a1)
-; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
 ; RV32I-NEXT:    sw a6, 12(sp)
 ; RV32I-NEXT:    sw a7, 16(sp)
 ; RV32I-NEXT:    sw a1, 20(sp)
-; RV32I-NEXT:    addi a0, sp, 24
-; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    sw a2, 24(sp)
 ; RV32I-NEXT:    sw a3, 28(sp)
 ; RV32I-NEXT:    sw a4, 32(sp)
-; RV32I-NEXT:    sw a5, 36(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    addi a0, sp, 24
+; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    call __netf2
 ; RV32I-NEXT:    snez a0, a0
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
@@ -52,35 +52,35 @@ define i32 @test_add_and_fptosi() nounwind {
 ; RV32I-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lui a0, %hi(x)
 ; RV32I-NEXT:    lui a1, %hi(y)
-; RV32I-NEXT:    lw a3, %lo(x)(a0)
-; RV32I-NEXT:    lw a4, %lo(x+4)(a0)
-; RV32I-NEXT:    lw a5, %lo(x+8)(a0)
-; RV32I-NEXT:    lw a6, %lo(x+12)(a0)
-; RV32I-NEXT:    lw a0, %lo(y)(a1)
-; RV32I-NEXT:    lw a2, %lo(y+4)(a1)
+; RV32I-NEXT:    lw a2, %lo(x)(a0)
+; RV32I-NEXT:    lw a3, %lo(x+4)(a0)
+; RV32I-NEXT:    lw a4, %lo(x+8)(a0)
+; RV32I-NEXT:    lw a0, %lo(x+12)(a0)
+; RV32I-NEXT:    lw a5, %lo(y)(a1)
+; RV32I-NEXT:    lw a6, %lo(y+4)(a1)
 ; RV32I-NEXT:    lw a7, %lo(y+8)(a1)
 ; RV32I-NEXT:    lw a1, %lo(y+12)(a1)
-; RV32I-NEXT:    sw a0, 24(sp)
-; RV32I-NEXT:    sw a2, 28(sp)
+; RV32I-NEXT:    sw a5, 24(sp)
+; RV32I-NEXT:    sw a6, 28(sp)
 ; RV32I-NEXT:    sw a7, 32(sp)
 ; RV32I-NEXT:    sw a1, 36(sp)
+; RV32I-NEXT:    sw a2, 40(sp)
+; RV32I-NEXT:    sw a3, 44(sp)
+; RV32I-NEXT:    sw a4, 48(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
 ; RV32I-NEXT:    addi a0, sp, 56
 ; RV32I-NEXT:    addi a1, sp, 40
 ; RV32I-NEXT:    addi a2, sp, 24
-; RV32I-NEXT:    sw a3, 40(sp)
-; RV32I-NEXT:    sw a4, 44(sp)
-; RV32I-NEXT:    sw a5, 48(sp)
-; RV32I-NEXT:    sw a6, 52(sp)
 ; RV32I-NEXT:    call __addtf3
-; RV32I-NEXT:    lw a1, 56(sp)
-; RV32I-NEXT:    lw a2, 60(sp)
-; RV32I-NEXT:    lw a3, 64(sp)
-; RV32I-NEXT:    lw a4, 68(sp)
+; RV32I-NEXT:    lw a0, 56(sp)
+; RV32I-NEXT:    lw a1, 60(sp)
+; RV32I-NEXT:    lw a2, 64(sp)
+; RV32I-NEXT:    lw a3, 68(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
+; RV32I-NEXT:    sw a2, 16(sp)
+; RV32I-NEXT:    sw a3, 20(sp)
 ; RV32I-NEXT:    addi a0, sp, 8
-; RV32I-NEXT:    sw a1, 8(sp)
-; RV32I-NEXT:    sw a2, 12(sp)
-; RV32I-NEXT:    sw a3, 16(sp)
-; RV32I-NEXT:    sw a4, 20(sp)
 ; RV32I-NEXT:    call __fixtfsi
 ; RV32I-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 80
@@ -101,26 +101,26 @@ define fp128 @fmaximum(fp128 %x, fp128 %y) {
 ; RV32I-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    .cfi_offset ra, -4
 ; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw a4, 4(a1)
-; RV32I-NEXT:    lw a5, 8(a1)
-; RV32I-NEXT:    lw a6, 12(a1)
-; RV32I-NEXT:    lw a1, 0(a2)
-; RV32I-NEXT:    lw a7, 4(a2)
-; RV32I-NEXT:    lw t0, 8(a2)
-; RV32I-NEXT:    lw a2, 12(a2)
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    sw a1, 8(sp)
-; RV32I-NEXT:    sw a7, 12(sp)
-; RV32I-NEXT:    sw t0, 16(sp)
+; RV32I-NEXT:    lw a0, 0(a1)
+; RV32I-NEXT:    lw a3, 4(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    lw a5, 0(a2)
+; RV32I-NEXT:    lw a6, 4(a2)
+; RV32I-NEXT:    lw a7, 8(a2)
+; RV32I-NEXT:    lw a2, 12(a2)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a7, 16(sp)
 ; RV32I-NEXT:    sw a2, 20(sp)
+; RV32I-NEXT:    sw a0, 24(sp)
+; RV32I-NEXT:    sw a3, 28(sp)
+; RV32I-NEXT:    sw a4, 32(sp)
+; RV32I-NEXT:    sw a1, 36(sp)
 ; RV32I-NEXT:    addi a0, sp, 40
 ; RV32I-NEXT:    addi a1, sp, 24
 ; RV32I-NEXT:    addi a2, sp, 8
-; RV32I-NEXT:    sw a3, 24(sp)
-; RV32I-NEXT:    sw a4, 28(sp)
-; RV32I-NEXT:    sw a5, 32(sp)
-; RV32I-NEXT:    sw a6, 36(sp)
 ; RV32I-NEXT:    call fmaximuml
 ; RV32I-NEXT:    lw a0, 40(sp)
 ; RV32I-NEXT:    lw a1, 44(sp)
@@ -150,26 +150,26 @@ define fp128 @fminimum(fp128 %x, fp128 %y) {
 ; RV32I-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    .cfi_offset ra, -4
 ; RV32I-NEXT:    .cfi_offset s0, -8
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw a4, 4(a1)
-; RV32I-NEXT:    lw a5, 8(a1)
-; RV32I-NEXT:    lw a6, 12(a1)
-; RV32I-NEXT:    lw a1, 0(a2)
-; RV32I-NEXT:    lw a7, 4(a2)
-; RV32I-NEXT:    lw t0, 8(a2)
-; RV32I-NEXT:    lw a2, 12(a2)
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    sw a1, 8(sp)
-; RV32I-NEXT:    sw a7, 12(sp)
-; RV32I-NEXT:    sw t0, 16(sp)
+; RV32I-NEXT:    lw a0, 0(a1)
+; RV32I-NEXT:    lw a3, 4(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    lw a5, 0(a2)
+; RV32I-NEXT:    lw a6, 4(a2)
+; RV32I-NEXT:    lw a7, 8(a2)
+; RV32I-NEXT:    lw a2, 12(a2)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a7, 16(sp)
 ; RV32I-NEXT:    sw a2, 20(sp)
+; RV32I-NEXT:    sw a0, 24(sp)
+; RV32I-NEXT:    sw a3, 28(sp)
+; RV32I-NEXT:    sw a4, 32(sp)
+; RV32I-NEXT:    sw a1, 36(sp)
 ; RV32I-NEXT:    addi a0, sp, 40
 ; RV32I-NEXT:    addi a1, sp, 24
 ; RV32I-NEXT:    addi a2, sp, 8
-; RV32I-NEXT:    sw a3, 24(sp)
-; RV32I-NEXT:    sw a4, 28(sp)
-; RV32I-NEXT:    sw a5, 32(sp)
-; RV32I-NEXT:    sw a6, 36(sp)
 ; RV32I-NEXT:    call fminimuml
 ; RV32I-NEXT:    lw a0, 40(sp)
 ; RV32I-NEXT:    lw a1, 44(sp)
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index c5c3b199447a9..2c1503cc162ea 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -192,8 +192,8 @@ define i32 @ustest_f64i32(double %x) {
 ; RV32IF-NEXT:  .LBB2_3: # %entry
 ; RV32IF-NEXT:    addi a3, a2, -1
 ; RV32IF-NEXT:    neg a2, a2
-; RV32IF-NEXT:    and a1, a2, a1
 ; RV32IF-NEXT:    or a0, a3, a0
+; RV32IF-NEXT:    and a1, a2, a1
 ; RV32IF-NEXT:    beqz a1, .LBB2_5
 ; RV32IF-NEXT:  # %bb.4: # %entry
 ; RV32IF-NEXT:    sgtz a1, a1
@@ -501,8 +501,8 @@ define i32 @ustest_f16i32(half %x) {
 ; RV32-NEXT:  .LBB8_3: # %entry
 ; RV32-NEXT:    addi a3, a2, -1
 ; RV32-NEXT:    neg a2, a2
-; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    or a0, a3, a0
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    beqz a1, .LBB8_5
 ; RV32-NEXT:  # %bb.4: # %entry
 ; RV32-NEXT:    sgtz a1, a1
@@ -1277,20 +1277,20 @@ define i64 @utest_f64i64(double %x) {
 ; RV32IF-NEXT:    mv a1, a0
 ; RV32IF-NEXT:    addi a0, sp, 8
 ; RV32IF-NEXT:    call __fixunsdfti
-; RV32IF-NEXT:    lw a0, 16(sp)
-; RV32IF-NEXT:    lw a1, 20(sp)
-; RV32IF-NEXT:    lw a2, 12(sp)
-; RV32IF-NEXT:    lw a3, 8(sp)
-; RV32IF-NEXT:    or a4, a1, a0
-; RV32IF-NEXT:    xori a0, a0, 1
+; RV32IF-NEXT:    lw a0, 8(sp)
+; RV32IF-NEXT:    lw a1, 12(sp)
+; RV32IF-NEXT:    lw a2, 16(sp)
+; RV32IF-NEXT:    lw a3, 20(sp)
+; RV32IF-NEXT:    or a4, a3, a2
+; RV32IF-NEXT:    xori a2, a2, 1
 ; RV32IF-NEXT:    seqz a4, a4
-; RV32IF-NEXT:    or a0, a0, a1
-; RV32IF-NEXT:    seqz a0, a0
-; RV32IF-NEXT:    addi a0, a0, -1
-; RV32IF-NEXT:    and a0, a0, a4
-; RV32IF-NEXT:    neg a1, a0
-; RV32IF-NEXT:    and a0, a1, a3
-; RV32IF-NEXT:    and a1, a1, a2
+; RV32IF-NEXT:    or a2, a2, a3
+; RV32IF-NEXT:    seqz a2, a2
+; RV32IF-NEXT:    addi a2, a2, -1
+; RV32IF-NEXT:    and a2, a2, a4
+; RV32IF-NEXT:    neg a2, a2
+; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    and a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    .cfi_restore ra
 ; RV32IF-NEXT:    addi sp, sp, 32
@@ -1321,20 +1321,20 @@ define i64 @utest_f64i64(double %x) {
 ; RV32IFD-NEXT:    .cfi_offset ra, -4
 ; RV32IFD-NEXT:    addi a0, sp, 8
 ; RV32IFD-NEXT:    call __fixunsdfti
-; RV32IFD-NEXT:    lw a0, 16(sp)
-; RV32IFD-NEXT:    lw a1, 20(sp)
-; RV32IFD-NEXT:    lw a2, 12(sp)
-; RV32IFD-NEXT:    lw a3, 8(sp)
-; RV32IFD-NEXT:    or a4, a1, a0
-; RV32IFD-NEXT:    xori a0, a0, 1
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    lw a2, 16(sp)
+; RV32IFD-NEXT:    lw a3, 20(sp)
+; RV32IFD-NEXT:    or a4, a3, a2
+; RV32IFD-NEXT:    xori a2, a2, 1
 ; RV32IFD-NEXT:    seqz a4, a4
-; RV32IFD-NEXT:    or a0, a0, a1
-; RV32IFD-NEXT:    seqz a0, a0
-; RV32IFD-NEXT:    addi a0, a0, -1
-; RV32IFD-NEXT:    and a0, a0, a4
-; RV32IFD-NEXT:    neg a1, a0
-; RV32IFD-NEXT:    and a0, a1, a3
-; RV32IFD-NEXT:    and a1, a1, a2
+; RV32IFD-NEXT:    or a2, a2, a3
+; RV32IFD-NEXT:    seqz a2, a2
+; RV32IFD-NEXT:    addi a2, a2, -1
+; RV32IFD-NEXT:    and a2, a2, a4
+; RV32IFD-NEXT:    neg a2, a2
+; RV32IFD-NEXT:    and a0, a2, a0
+; RV32IFD-NEXT:    and a1, a2, a1
 ; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    .cfi_restore ra
 ; RV32IFD-NEXT:    addi sp, sp, 32
@@ -1359,8 +1359,8 @@ define i64 @ustest_f64i64(double %x) {
 ; RV32IF-NEXT:    mv a1, a0
 ; RV32IF-NEXT:    addi a0, sp, 8
 ; RV32IF-NEXT:    call __fixdfti
-; RV32IF-NEXT:    lw a1, 20(sp)
 ; RV32IF-NEXT:    lw a0, 16(sp)
+; RV32IF-NEXT:    lw a1, 20(sp)
 ; RV32IF-NEXT:    beqz a1, .LBB20_2
 ; RV32IF-NEXT:  # %bb.1: # %entry
 ; RV32IF-NEXT:    slti a2, a1, 0
@@ -1378,8 +1378,8 @@ define i64 @ustest_f64i64(double %x) {
 ; RV32IF-NEXT:  # %bb.4: # %entry
 ; RV32IF-NEXT:    li a0, 1
 ; RV32IF-NEXT:  .LBB20_5: # %entry
-; RV32IF-NEXT:    lw a3, 8(sp)
-; RV32IF-NEXT:    lw a4, 12(sp)
+; RV32IF-NEXT:    lw a4, 8(sp)
+; RV32IF-NEXT:    lw a3, 12(sp)
 ; RV32IF-NEXT:    and a5, a2, a1
 ; RV32IF-NEXT:    beqz a5, .LBB20_7
 ; RV32IF-NEXT:  # %bb.6: # %entry
@@ -1388,17 +1388,17 @@ define i64 @ustest_f64i64(double %x) {
 ; RV32IF-NEXT:  .LBB20_7:
 ; RV32IF-NEXT:    snez a1, a0
 ; RV32IF-NEXT:  .LBB20_8: # %entry
-; RV32IF-NEXT:    and a4, a2, a4
+; RV32IF-NEXT:    and a3, a2, a3
+; RV32IF-NEXT:    and a2, a2, a4
 ; RV32IF-NEXT:    or a0, a0, a5
-; RV32IF-NEXT:    and a2, a2, a3
 ; RV32IF-NEXT:    bnez a0, .LBB20_10
 ; RV32IF-NEXT:  # %bb.9:
-; RV32IF-NEXT:    or a0, a2, a4
+; RV32IF-NEXT:    or a0, a2, a3
 ; RV32IF-NEXT:    snez a1, a0
 ; RV32IF-NEXT:  .LBB20_10: # %entry
 ; RV32IF-NEXT:    neg a1, a1
 ; RV32IF-NEXT:    and a0, a1, a2
-; RV32IF-NEXT:    and a1, a1, a4
+; RV32IF-NEXT:    and a1, a1, a3
 ; RV32IF-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    .cfi_restore ra
 ; RV32IF-NEXT:    addi sp, sp, 32
@@ -1442,8 +1442,8 @@ define i64 @ustest_f64i64(double %x) {
 ; RV32IFD-NEXT:    .cfi_offset ra, -4
 ; RV32IFD-NEXT:    addi a0, sp, 8
 ; RV32IFD-NEXT:    call __fixdfti
-; RV32IFD-NEXT:    lw a1, 20(sp)
 ; RV32IFD-NEXT:    lw a0, 16(sp)
+; RV32IFD-NEXT:    lw a1, 20(sp)
 ; RV32IFD-NEXT:    beqz a1, .LBB20_2
 ; RV32IFD-NEXT:  # %bb.1: # %entry
 ; RV32IFD-NEXT:    slti a2, a1, 0
@@ -1461,8 +1461,8 @@ define i64 @ustest_f64i64(double %x) {
 ; RV32IFD-NEXT:  # %bb.4: # %entry
 ; RV32IFD-NEXT:    li a0, 1
 ; RV32IFD-NEXT:  .LBB20_5: # %entry
-; RV32IFD-NEXT:    lw a3, 8(sp)
-; RV32IFD-NEXT:    lw a4, 12(sp)
+; RV32IFD-NEXT:    lw a4, 8(sp)
+; RV32IFD-NEXT:    lw a3, 12(sp)
 ; RV32IFD-NEXT:    and a5, a2, a1
 ; RV32IFD-NEXT:    beqz a5, .LBB20_7
 ; RV32IFD-NEXT:  # %bb.6: # %entry
@@ -1471,17 +1471,17 @@ define i64 @ustest_f64i64(double %x) {
 ; RV32IFD-NEXT:  .LBB20_7:
 ; RV32IFD-NEXT:    snez a1, a0
 ; RV32IFD-NEXT:  .LBB20_8: # %entry
-; RV32IFD-NEXT:    and a4, a2, a4
+; RV32IFD-NEXT:    and a3, a2, a3
+; RV32IFD-NEXT:    and a2, a2, a4
 ; RV32IFD-NEXT:    or a0, a0, a5
-; RV32IFD-NEXT:    and a2, a2, a3
 ; RV32IFD-NEXT:    bnez a0, .LBB20_10
 ; RV32IFD-NEXT:  # %bb.9:
-; RV32IFD-NEXT:    or a0, a2, a4
+; RV32IFD-NEXT:    or a0, a2, a3
 ; RV32IFD-NEXT:    snez a1, a0
 ; RV32IFD-NEXT:  .LBB20_10: # %entry
 ; RV32IFD-NEXT:    neg a1, a1
 ; RV32IFD-NEXT:    and a0, a1, a2
-; RV32IFD-NEXT:    and a1, a1, a4
+; RV32IFD-NEXT:    and a1, a1, a3
 ; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    .cfi_restore ra
 ; RV32IFD-NEXT:    addi sp, sp, 32
@@ -1587,20 +1587,20 @@ define i64 @utest_f32i64(float %x) {
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixunssfti
-; RV32-NEXT:    lw a0, 16(sp)
-; RV32-NEXT:    lw a1, 20(sp)
-; RV32-NEXT:    lw a2, 12(sp)
-; RV32-NEXT:    lw a3, 8(sp)
-; RV32-NEXT:    or a4, a1, a0
-; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    lw a0, 8(sp)
+; RV32-NEXT:    lw a1, 12(sp)
+; RV32-NEXT:    lw a2, 16(sp)
+; RV32-NEXT:    lw a3, 20(sp)
+; RV32-NEXT:    or a4, a3, a2
+; RV32-NEXT:    xori a2, a2, 1
 ; RV32-NEXT:    seqz a4, a4
-; RV32-NEXT:    or a0, a0, a1
-; RV32-NEXT:    seqz a0, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a4
-; RV32-NEXT:    neg a1, a0
-; RV32-NEXT:    and a0, a1, a3
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    or a2, a2, a3
+; RV32-NEXT:    seqz a2, a2
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    and a2, a2, a4
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore ra
 ; RV32-NEXT:    addi sp, sp, 32
@@ -1639,8 +1639,8 @@ define i64 @ustest_f32i64(float %x) {
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti
-; RV32-NEXT:    lw a1, 20(sp)
 ; RV32-NEXT:    lw a0, 16(sp)
+; RV32-NEXT:    lw a1, 20(sp)
 ; RV32-NEXT:    beqz a1, .LBB23_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    slti a2, a1, 0
@@ -1658,8 +1658,8 @@ define i64 @ustest_f32i64(float %x) {
 ; RV32-NEXT:  # %bb.4: # %entry
 ; RV32-NEXT:    li a0, 1
 ; RV32-NEXT:  .LBB23_5: # %entry
-; RV32-NEXT:    lw a3, 8(sp)
-; RV32-NEXT:    lw a4, 12(sp)
+; RV32-NEXT:    lw a4, 8(sp)
+; RV32-NEXT:    lw a3, 12(sp)
 ; RV32-NEXT:    and a5, a2, a1
 ; RV32-NEXT:    beqz a5, .LBB23_7
 ; RV32-NEXT:  # %bb.6: # %entry
@@ -1668,17 +1668,17 @@ define i64 @ustest_f32i64(float %x) {
 ; RV32-NEXT:  .LBB23_7:
 ; RV32-NEXT:    snez a1, a0
 ; RV32-NEXT:  .LBB23_8: # %entry
-; RV32-NEXT:    and a4, a2, a4
+; RV32-NEXT:    and a3, a2, a3
+; RV32-NEXT:    and a2, a2, a4
 ; RV32-NEXT:    or a0, a0, a5
-; RV32-NEXT:    and a2, a2, a3
 ; RV32-NEXT:    bnez a0, .LBB23_10
 ; RV32-NEXT:  # %bb.9:
-; RV32-NEXT:    or a0, a2, a4
+; RV32-NEXT:    or a0, a2, a3
 ; RV32-NEXT:    snez a1, a0
 ; RV32-NEXT:  .LBB23_10: # %entry
 ; RV32-NEXT:    neg a1, a1
 ; RV32-NEXT:    and a0, a1, a2
-; RV32-NEXT:    and a1, a1, a4
+; RV32-NEXT:    and a1, a1, a3
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore ra
 ; RV32-NEXT:    addi sp, sp, 32
@@ -1848,20 +1848,20 @@ define i64 @utesth_f16i64(half %x) {
 ; RV32-NEXT:    call __extendhfsf2
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixunssfti
-; RV32-NEXT:    lw a0, 16(sp)
-; RV32-NEXT:    lw a1, 20(sp)
-; RV32-NEXT:    lw a2, 12(sp)
-; RV32-NEXT:    lw a3, 8(sp)
-; RV32-NEXT:    or a4, a1, a0
-; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    lw a0, 8(sp)
+; RV32-NEXT:    lw a1, 12(sp)
+; RV32-NEXT:    lw a2, 16(sp)
+; RV32-NEXT:    lw a3, 20(sp)
+; RV32-NEXT:    or a4, a3, a2
+; RV32-NEXT:    xori a2, a2, 1
 ; RV32-NEXT:    seqz a4, a4
-; RV32-NEXT:    or a0, a0, a1
-; RV32-NEXT:    seqz a0, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a4
-; RV32-NEXT:    neg a1, a0
-; RV32-NEXT:    and a0, a1, a3
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    or a2, a2, a3
+; RV32-NEXT:    seqz a2, a2
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    and a2, a2, a4
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore ra
 ; RV32-NEXT:    addi sp, sp, 32
@@ -1902,8 +1902,8 @@ define i64 @ustest_f16i64(half %x) {
 ; RV32-NEXT:    call __extendhfsf2
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti
-; RV32-NEXT:    lw a1, 20(sp)
 ; RV32-NEXT:    lw a0, 16(sp)
+; RV32-NEXT:    lw a1, 20(sp)
 ; RV32-NEXT:    beqz a1, .LBB26_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    slti a2, a1, 0
@@ -1921,8 +1921,8 @@ define i64 @ustest_f16i64(half %x) {
 ; RV32-NEXT:  # %bb.4: # %entry
 ; RV32-NEXT:    li a0, 1
 ; RV32-NEXT:  .LBB26_5: # %entry
-; RV32-NEXT:    lw a3, 8(sp)
-; RV32-NEXT:    lw a4, 12(sp)
+; RV32-NEXT:    lw a4, 8(sp)
+; RV32-NEXT:    lw a3, 12(sp)
 ; RV32-NEXT:    and a5, a2, a1
 ; RV32-NEXT:    beqz a5, .LBB26_7
 ; RV32-NEXT:  # %bb.6: # %entry
@@ -1931,17 +1931,17 @@ define i64 @ustest_f16i64(half %x) {
 ; RV32-NEXT:  .LBB26_7:
 ; RV32-NEXT:    snez a1, a0
 ; RV32-NEXT:  .LBB26_8: # %entry
-; RV32-NEXT:    and a4, a2, a4
+; RV32-NEXT:    and a3, a2, a3
+; RV32-NEXT:    and a2, a2, a4
 ; RV32-NEXT:    or a0, a0, a5
-; RV32-NEXT:    and a2, a2, a3
 ; RV32-NEXT:    bnez a0, .LBB26_10
 ; RV32-NEXT:  # %bb.9:
-; RV32-NEXT:    or a0, a2, a4
+; RV32-NEXT:    or a0, a2, a3
 ; RV32-NEXT:    snez a1, a0
 ; RV32-NEXT:  .LBB26_10: # %entry
 ; RV32-NEXT:    neg a1, a1
 ; RV32-NEXT:    and a0, a1, a2
-; RV32-NEXT:    and a1, a1, a4
+; RV32-NEXT:    and a1, a1, a3
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore ra
 ; RV32-NEXT:    addi sp, sp, 32
@@ -3211,20 +3211,20 @@ define i64 @utest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    mv a1, a0
 ; RV32IF-NEXT:    addi a0, sp, 8
 ; RV32IF-NEXT:    call __fixunsdfti
-; RV32IF-NEXT:    lw a0, 16(sp)
-; RV32IF-NEXT:    lw a1, 20(sp)
-; RV32IF-NEXT:    lw a2, 12(sp)
-; RV32IF-NEXT:    lw a3, 8(sp)
-; RV32IF-NEXT:    or a4, a1, a0
-; RV32IF-NEXT:    xori a0, a0, 1
+; RV32IF-NEXT:    lw a0, 8(sp)
+; RV32IF-NEXT:    lw a1, 12(sp)
+; RV32IF-NEXT:    lw a2, 16(sp)
+; RV32IF-NEXT:    lw a3, 20(sp)
+; RV32IF-NEXT:    or a4, a3, a2
+; RV32IF-NEXT:    xori a2, a2, 1
 ; RV32IF-NEXT:    seqz a4, a4
-; RV32IF-NEXT:    or a0, a0, a1
-; RV32IF-NEXT:    seqz a0, a0
-; RV32IF-NEXT:    addi a0, a0, -1
-; RV32IF-NEXT:    and a0, a0, a4
-; RV32IF-NEXT:    neg a1, a0
-; RV32IF-NEXT:    and a0, a1, a3
-; RV32IF-NEXT:    and a1, a1, a2
+; RV32IF-NEXT:    or a2, a2, a3
+; RV32IF-NEXT:    seqz a2, a2
+; RV32IF-NEXT:    addi a2, a2, -1
+; RV32IF-NEXT:    and a2, a2, a4
+; RV32IF-NEXT:    neg a2, a2
+; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    and a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    .cfi_restore ra
 ; RV32IF-NEXT:    addi sp, sp, 32
@@ -3255,20 +3255,20 @@ define i64 @utest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    .cfi_offset ra, -4
 ; RV32IFD-NEXT:    addi a0, sp, 8
 ; RV32IFD-NEXT:    call __fixunsdfti
-; RV32IFD-NEXT:    lw a0, 16(sp)
-; RV32IFD-NEXT:    lw a1, 20(sp)
-; RV32IFD-NEXT:    lw a2, 12(sp)
-; RV32IFD-NEXT:    lw a3, 8(sp)
-; RV32IFD-NEXT:    or a4, a1, a0
-; RV32IFD-NEXT:    xori a0, a0, 1
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    lw a2, 16(sp)
+; RV32IFD-NEXT:    lw a3, 20(sp)
+; RV32IFD-NEXT:    or a4, a3, a2
+; RV32IFD-NEXT:    xori a2, a2, 1
 ; RV32IFD-NEXT:    seqz a4, a4
-; RV32IFD-NEXT:    or a0, a0, a1
-; RV32IFD-NEXT:    seqz a0, a0
-; RV32IFD-NEXT:    addi a0, a0, -1
-; RV32IFD-NEXT:    and a0, a0, a4
-; RV32IFD-NEXT:    neg a1, a0
-; RV32IFD-NEXT:    and a0, a1, a3
-; RV32IFD-NEXT:    and a1, a1, a2
+; RV32IFD-NEXT:    or a2, a2, a3
+; RV32IFD-NEXT:    seqz a2, a2
+; RV32IFD-NEXT:    addi a2, a2, -1
+; RV32IFD-NEXT:    and a2, a2, a4
+; RV32IFD-NEXT:    neg a2, a2
+; RV32IFD-NEXT:    and a0, a2, a0
+; RV32IFD-NEXT:    and a1, a2, a1
 ; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    .cfi_restore ra
 ; RV32IFD-NEXT:    addi sp, sp, 32
@@ -3292,30 +3292,30 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    mv a1, a0
 ; RV32IF-NEXT:    addi a0, sp, 8
 ; RV32IF-NEXT:    call __fixdfti
-; RV32IF-NEXT:    lw a0, 20(sp)
-; RV32IF-NEXT:    lw a1, 8(sp)
+; RV32IF-NEXT:    lw a0, 8(sp)
 ; RV32IF-NEXT:    lw a2, 12(sp)
 ; RV32IF-NEXT:    lw a3, 16(sp)
-; RV32IF-NEXT:    beqz a0, .LBB47_2
+; RV32IF-NEXT:    lw a1, 20(sp)
+; RV32IF-NEXT:    beqz a1, .LBB47_2
 ; RV32IF-NEXT:  # %bb.1: # %entry
-; RV32IF-NEXT:    slti a4, a0, 0
+; RV32IF-NEXT:    slti a4, a1, 0
 ; RV32IF-NEXT:    j .LBB47_3
 ; RV32IF-NEXT:  .LBB47_2:
 ; RV32IF-NEXT:    seqz a4, a3
 ; RV32IF-NEXT:  .LBB47_3: # %entry
 ; RV32IF-NEXT:    xori a3, a3, 1
-; RV32IF-NEXT:    or a3, a3, a0
+; RV32IF-NEXT:    or a3, a3, a1
 ; RV32IF-NEXT:    seqz a3, a3
 ; RV32IF-NEXT:    addi a3, a3, -1
 ; RV32IF-NEXT:    and a3, a3, a4
 ; RV32IF-NEXT:    neg a3, a3
 ; RV32IF-NEXT:    and a2, a3, a2
-; RV32IF-NEXT:    and a1, a3, a1
 ; RV32IF-NEXT:    and a0, a3, a0
-; RV32IF-NEXT:    slti a0, a0, 0
-; RV32IF-NEXT:    addi a3, a0, -1
-; RV32IF-NEXT:    and a0, a3, a1
-; RV32IF-NEXT:    and a1, a3, a2
+; RV32IF-NEXT:    and a1, a3, a1
+; RV32IF-NEXT:    slti a1, a1, 0
+; RV32IF-NEXT:    addi a1, a1, -1
+; RV32IF-NEXT:    and a0, a1, a0
+; RV32IF-NEXT:    and a1, a1, a2
 ; RV32IF-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    .cfi_restore ra
 ; RV32IF-NEXT:    addi sp, sp, 32
@@ -3354,30 +3354,30 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    .cfi_offset ra, -4
 ; RV32IFD-NEXT:    addi a0, sp, 8
 ; RV32IFD-NEXT:    call __fixdfti
-; RV32IFD-NEXT:    lw a0, 20(sp)
-; RV32IFD-NEXT:    lw a1, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
 ; RV32IFD-NEXT:    lw a2, 12(sp)
 ; RV32IFD-NEXT:    lw a3, 16(sp)
-; RV32IFD-NEXT:    beqz a0, .LBB47_2
+; RV32IFD-NEXT:    lw a1, 20(sp)
+; RV32IFD-NEXT:    beqz a1, .LBB47_2
 ; RV32IFD-NEXT:  # %bb.1: # %entry
-; RV32IFD-NEXT:    slti a4, a0, 0
+; RV32IFD-NEXT:    slti a4, a1, 0
 ; RV32IFD-NEXT:    j .LBB47_3
 ; RV32IFD-NEXT:  .LBB47_2:
 ; RV32IFD-NEXT:    seqz a4, a3
 ; RV32IFD-NEXT:  .LBB47_3: # %entry
 ; RV32IFD-NEXT:    xori a3, a3, 1
-; RV32IFD-NEXT:    or a3, a3, a0
+; RV32IFD-NEXT:    or a3, a3, a1
 ; RV32IFD-NEXT:    seqz a3, a3
 ; RV32IFD-NEXT:    addi a3, a3, -1
 ; RV32IFD-NEXT:    and a3, a3, a4
 ; RV32IFD-NEXT:    neg a3, a3
 ; RV32IFD-NEXT:    and a2, a3, a2
-; RV32IFD-NEXT:    and a1, a3, a1
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    slti a0, a0, 0
-; RV32IFD-NEXT:    addi a3, a0, -1
-; RV32IFD-NEXT:    and a0, a3, a1
-; RV32IFD-NEXT:    and a1, a3, a2
+; RV32IFD-NEXT:    and a1, a3, a1
+; RV32IFD-NEXT:    slti a1, a1, 0
+; RV32IFD-NEXT:    addi a1, a1, -1
+; RV32IFD-NEXT:    and a0, a1, a0
+; RV32IFD-NEXT:    and a1, a1, a2
 ; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    .cfi_restore ra
 ; RV32IFD-NEXT:    addi sp, sp, 32
@@ -3479,20 +3479,20 @@ define i64 @utest_f32i64_mm(float %x) {
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixunssfti
-; RV32-NEXT:    lw a0, 16(sp)
-; RV32-NEXT:    lw a1, 20(sp)
-; RV32-NEXT:    lw a2, 12(sp)
-; RV32-NEXT:    lw a3, 8(sp)
-; RV32-NEXT:    or a4, a1, a0
-; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    lw a0, 8(sp)
+; RV32-NEXT:    lw a1, 12(sp)
+; RV32-NEXT:    lw a2, 16(sp)
+; RV32-NEXT:    lw a3, 20(sp)
+; RV32-NEXT:    or a4, a3, a2
+; RV32-NEXT:    xori a2, a2, 1
 ; RV32-NEXT:    seqz a4, a4
-; RV32-NEXT:    or a0, a0, a1
-; RV32-NEXT:    seqz a0, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a4
-; RV32-NEXT:    neg a1, a0
-; RV32-NEXT:    and a0, a1, a3
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    or a2, a2, a3
+; RV32-NEXT:    seqz a2, a2
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    and a2, a2, a4
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore ra
 ; RV32-NEXT:    addi sp, sp, 32
@@ -3530,30 +3530,30 @@ define i64 @ustest_f32i64_mm(float %x) {
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti
-; RV32-NEXT:    lw a0, 20(sp)
-; RV32-NEXT:    lw a1, 8(sp)
+; RV32-NEXT:    lw a0, 8(sp)
 ; RV32-NEXT:    lw a2, 12(sp)
 ; RV32-NEXT:    lw a3, 16(sp)
-; RV32-NEXT:    beqz a0, .LBB50_2
+; RV32-NEXT:    lw a1, 20(sp)
+; RV32-NEXT:    beqz a1, .LBB50_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a4, a0, 0
+; RV32-NEXT:    slti a4, a1, 0
 ; RV32-NEXT:    j .LBB50_3
 ; RV32-NEXT:  .LBB50_2:
 ; RV32-NEXT:    seqz a4, a3
 ; RV32-NEXT:  .LBB50_3: # %entry
 ; RV32-NEXT:    xori a3, a3, 1
-; RV32-NEXT:    or a3, a3, a0
+; RV32-NEXT:    or a3, a3, a1
 ; RV32-NEXT:    seqz a3, a3
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a3, a3, a4
 ; RV32-NEXT:    neg a3, a3
 ; RV32-NEXT:    and a2, a3, a2
-; RV32-NEXT:    and a1, a3, a1
 ; RV32-NEXT:    and a0, a3, a0
-; RV32-NEXT:    slti a0, a0, 0
-; RV32-NEXT:    addi a3, a0, -1
-; RV32-NEXT:    and a0, a3, a1
-; RV32-NEXT:    and a1, a3, a2
+; RV32-NEXT:    and a1, a3, a1
+; RV32-NEXT:    slti a1, a1, 0
+; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore ra
 ; RV32-NEXT:    addi sp, sp, 32
@@ -3714,20 +3714,20 @@ define i64 @utesth_f16i64_mm(half %x) {
 ; RV32-NEXT:    call __extendhfsf2
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixunssfti
-; RV32-NEXT:    lw a0, 16(sp)
-; RV32-NEXT:    lw a1, 20(sp)
-; RV32-NEXT:    lw a2, 12(sp)
-; RV32-NEXT:    lw a3, 8(sp)
-; RV32-NEXT:    or a4, a1, a0
-; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    lw a0, 8(sp)
+; RV32-NEXT:    lw a1, 12(sp)
+; RV32-NEXT:    lw a2, 16(sp)
+; RV32-NEXT:    lw a3, 20(sp)
+; RV32-NEXT:    or a4, a3, a2
+; RV32-NEXT:    xori a2, a2, 1
 ; RV32-NEXT:    seqz a4, a4
-; RV32-NEXT:    or a0, a0, a1
-; RV32-NEXT:    seqz a0, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a4
-; RV32-NEXT:    neg a1, a0
-; RV32-NEXT:    and a0, a1, a3
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    or a2, a2, a3
+; RV32-NEXT:    seqz a2, a2
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    and a2, a2, a4
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore ra
 ; RV32-NEXT:    addi sp, sp, 32
@@ -3767,30 +3767,30 @@ define i64 @ustest_f16i64_mm(half %x) {
 ; RV32-NEXT:    call __extendhfsf2
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti
-; RV32-NEXT:    lw a0, 20(sp)
-; RV32-NEXT:    lw a1, 8(sp)
+; RV32-NEXT:    lw a0, 8(sp)
 ; RV32-NEXT:    lw a2, 12(sp)
 ; RV32-NEXT:    lw a3, 16(sp)
-; RV32-NEXT:    beqz a0, .LBB53_2
+; RV32-NEXT:    lw a1, 20(sp)
+; RV32-NEXT:    beqz a1, .LBB53_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a4, a0, 0
+; RV32-NEXT:    slti a4, a1, 0
 ; RV32-NEXT:    j .LBB53_3
 ; RV32-NEXT:  .LBB53_2:
 ; RV32-NEXT:    seqz a4, a3
 ; RV32-NEXT:  .LBB53_3: # %entry
 ; RV32-NEXT:    xori a3, a3, 1
-; RV32-NEXT:    or a3, a3, a0
+; RV32-NEXT:    or a3, a3, a1
 ; RV32-NEXT:    seqz a3, a3
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a3, a3, a4
 ; RV32-NEXT:    neg a3, a3
 ; RV32-NEXT:    and a2, a3, a2
-; RV32-NEXT:    and a1, a3, a1
 ; RV32-NEXT:    and a0, a3, a0
-; RV32-NEXT:    slti a0, a0, 0
-; RV32-NEXT:    addi a3, a0, -1
-; RV32-NEXT:    and a0, a3, a1
-; RV32-NEXT:    and a1, a3, a2
+; RV32-NEXT:    and a1, a3, a1
+; RV32-NEXT:    slti a1, a1, 0
+; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore ra
 ; RV32-NEXT:    addi sp, sp, 32
diff --git a/llvm/test/CodeGen/RISCV/get-setcc-result-type.ll b/llvm/test/CodeGen/RISCV/get-setcc-result-type.ll
index 9322abcfbbdce..9ca527573e0c6 100644
--- a/llvm/test/CodeGen/RISCV/get-setcc-result-type.ll
+++ b/llvm/test/CodeGen/RISCV/get-setcc-result-type.ll
@@ -5,22 +5,22 @@
 define void @getSetCCResultType(ptr %p, ptr %q) nounwind {
 ; RV32I-LABEL: getSetCCResultType:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lw a1, 12(a0)
-; RV32I-NEXT:    lw a2, 8(a0)
-; RV32I-NEXT:    lw a3, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    snez a1, a1
-; RV32I-NEXT:    snez a2, a2
-; RV32I-NEXT:    snez a3, a3
+; RV32I-NEXT:    lw a1, 0(a0)
+; RV32I-NEXT:    lw a2, 4(a0)
+; RV32I-NEXT:    lw a3, 8(a0)
+; RV32I-NEXT:    lw a4, 12(a0)
 ; RV32I-NEXT:    snez a4, a4
-; RV32I-NEXT:    addi a4, a4, -1
-; RV32I-NEXT:    addi a3, a3, -1
-; RV32I-NEXT:    addi a2, a2, -1
+; RV32I-NEXT:    snez a3, a3
+; RV32I-NEXT:    snez a2, a2
+; RV32I-NEXT:    snez a1, a1
 ; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    sw a4, 0(a0)
-; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a1, 12(a0)
+; RV32I-NEXT:    addi a2, a2, -1
+; RV32I-NEXT:    addi a3, a3, -1
+; RV32I-NEXT:    addi a4, a4, -1
+; RV32I-NEXT:    sw a1, 0(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
+; RV32I-NEXT:    sw a3, 8(a0)
+; RV32I-NEXT:    sw a4, 12(a0)
 ; RV32I-NEXT:    ret
 entry:
   %0 = load <4 x i32>, ptr %p, align 16
diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll
index a218e89948d4b..690bf6c284eb2 100644
--- a/llvm/test/CodeGen/RISCV/half-arith.ll
+++ b/llvm/test/CodeGen/RISCV/half-arith.ll
@@ -2885,14 +2885,14 @@ define half @fsgnjx_f16(half %x, half %y) nounwind {
 ;
 ; RV32IZFHMIN-LABEL: fsgnjx_f16:
 ; RV32IZFHMIN:       # %bb.0:
-; RV32IZFHMIN-NEXT:    lui a0, %hi(.LCPI23_0)
-; RV32IZFHMIN-NEXT:    lhu a0, %lo(.LCPI23_0)(a0)
-; RV32IZFHMIN-NEXT:    fmv.x.h a1, fa0
-; RV32IZFHMIN-NEXT:    lui a2, 1048568
-; RV32IZFHMIN-NEXT:    and a1, a1, a2
-; RV32IZFHMIN-NEXT:    slli a0, a0, 17
-; RV32IZFHMIN-NEXT:    srli a0, a0, 17
-; RV32IZFHMIN-NEXT:    or a0, a0, a1
+; RV32IZFHMIN-NEXT:    fmv.x.h a0, fa0
+; RV32IZFHMIN-NEXT:    lui a1, 1048568
+; RV32IZFHMIN-NEXT:    and a0, a0, a1
+; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI23_0)
+; RV32IZFHMIN-NEXT:    lhu a1, %lo(.LCPI23_0)(a1)
+; RV32IZFHMIN-NEXT:    slli a1, a1, 17
+; RV32IZFHMIN-NEXT:    srli a1, a1, 17
+; RV32IZFHMIN-NEXT:    or a0, a1, a0
 ; RV32IZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fa5, fa5
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fa4, fa1
@@ -2902,14 +2902,14 @@ define half @fsgnjx_f16(half %x, half %y) nounwind {
 ;
 ; RV64IZFHMIN-LABEL: fsgnjx_f16:
 ; RV64IZFHMIN:       # %bb.0:
-; RV64IZFHMIN-NEXT:    lui a0, %hi(.LCPI23_0)
-; RV64IZFHMIN-NEXT:    lhu a0, %lo(.LCPI23_0)(a0)
-; RV64IZFHMIN-NEXT:    fmv.x.h a1, fa0
-; RV64IZFHMIN-NEXT:    lui a2, 1048568
-; RV64IZFHMIN-NEXT:    and a1, a1, a2
-; RV64IZFHMIN-NEXT:    slli a0, a0, 49
-; RV64IZFHMIN-NEXT:    srli a0, a0, 49
-; RV64IZFHMIN-NEXT:    or a0, a0, a1
+; RV64IZFHMIN-NEXT:    fmv.x.h a0, fa0
+; RV64IZFHMIN-NEXT:    lui a1, 1048568
+; RV64IZFHMIN-NEXT:    and a0, a0, a1
+; RV64IZFHMIN-NEXT:    lui a1, %hi(.LCPI23_0)
+; RV64IZFHMIN-NEXT:    lhu a1, %lo(.LCPI23_0)(a1)
+; RV64IZFHMIN-NEXT:    slli a1, a1, 49
+; RV64IZFHMIN-NEXT:    srli a1, a1, 49
+; RV64IZFHMIN-NEXT:    or a0, a1, a0
 ; RV64IZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; RV64IZFHMIN-NEXT:    fcvt.s.h fa5, fa5
 ; RV64IZFHMIN-NEXT:    fcvt.s.h fa4, fa1
diff --git a/llvm/test/CodeGen/RISCV/half-convert-strict.ll b/llvm/test/CodeGen/RISCV/half-convert-strict.ll
index 0a04d44893e75..5396fab3437c7 100644
--- a/llvm/test/CodeGen/RISCV/half-convert-strict.ll
+++ b/llvm/test/CodeGen/RISCV/half-convert-strict.ll
@@ -2519,12 +2519,12 @@ define half @fcvt_h_q(fp128 %a) nounwind strictfp {
 ; RV32IZFH-NEXT:    lw a1, 0(a0)
 ; RV32IZFH-NEXT:    lw a2, 4(a0)
 ; RV32IZFH-NEXT:    lw a3, 8(a0)
-; RV32IZFH-NEXT:    lw a4, 12(a0)
-; RV32IZFH-NEXT:    addi a0, sp, 8
+; RV32IZFH-NEXT:    lw a0, 12(a0)
 ; RV32IZFH-NEXT:    sw a1, 8(sp)
 ; RV32IZFH-NEXT:    sw a2, 12(sp)
 ; RV32IZFH-NEXT:    sw a3, 16(sp)
-; RV32IZFH-NEXT:    sw a4, 20(sp)
+; RV32IZFH-NEXT:    sw a0, 20(sp)
+; RV32IZFH-NEXT:    addi a0, sp, 8
 ; RV32IZFH-NEXT:    call __trunctfhf2
 ; RV32IZFH-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    addi sp, sp, 32
@@ -2546,12 +2546,12 @@ define half @fcvt_h_q(fp128 %a) nounwind strictfp {
 ; RV32IZHINX-NEXT:    lw a1, 0(a0)
 ; RV32IZHINX-NEXT:    lw a2, 4(a0)
 ; RV32IZHINX-NEXT:    lw a3, 8(a0)
-; RV32IZHINX-NEXT:    lw a4, 12(a0)
-; RV32IZHINX-NEXT:    addi a0, sp, 8
+; RV32IZHINX-NEXT:    lw a0, 12(a0)
 ; RV32IZHINX-NEXT:    sw a1, 8(sp)
 ; RV32IZHINX-NEXT:    sw a2, 12(sp)
 ; RV32IZHINX-NEXT:    sw a3, 16(sp)
-; RV32IZHINX-NEXT:    sw a4, 20(sp)
+; RV32IZHINX-NEXT:    sw a0, 20(sp)
+; RV32IZHINX-NEXT:    addi a0, sp, 8
 ; RV32IZHINX-NEXT:    call __trunctfhf2
 ; RV32IZHINX-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    addi sp, sp, 32
@@ -2573,12 +2573,12 @@ define half @fcvt_h_q(fp128 %a) nounwind strictfp {
 ; RV32IDZFH-NEXT:    lw a1, 0(a0)
 ; RV32IDZFH-NEXT:    lw a2, 4(a0)
 ; RV32IDZFH-NEXT:    lw a3, 8(a0)
-; RV32IDZFH-NEXT:    lw a4, 12(a0)
-; RV32IDZFH-NEXT:    addi a0, sp, 8
+; RV32IDZFH-NEXT:    lw a0, 12(a0)
 ; RV32IDZFH-NEXT:    sw a1, 8(sp)
 ; RV32IDZFH-NEXT:    sw a2, 12(sp)
 ; RV32IDZFH-NEXT:    sw a3, 16(sp)
-; RV32IDZFH-NEXT:    sw a4, 20(sp)
+; RV32IDZFH-NEXT:    sw a0, 20(sp)
+; RV32IDZFH-NEXT:    addi a0, sp, 8
 ; RV32IDZFH-NEXT:    call __trunctfhf2
 ; RV32IDZFH-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IDZFH-NEXT:    addi sp, sp, 32
@@ -2600,12 +2600,12 @@ define half @fcvt_h_q(fp128 %a) nounwind strictfp {
 ; RV32IZDINXZHINX-NEXT:    lw a1, 0(a0)
 ; RV32IZDINXZHINX-NEXT:    lw a2, 4(a0)
 ; RV32IZDINXZHINX-NEXT:    lw a3, 8(a0)
-; RV32IZDINXZHINX-NEXT:    lw a4, 12(a0)
-; RV32IZDINXZHINX-NEXT:    addi a0, sp, 8
+; RV32IZDINXZHINX-NEXT:    lw a0, 12(a0)
 ; RV32IZDINXZHINX-NEXT:    sw a1, 8(sp)
 ; RV32IZDINXZHINX-NEXT:    sw a2, 12(sp)
 ; RV32IZDINXZHINX-NEXT:    sw a3, 16(sp)
-; RV32IZDINXZHINX-NEXT:    sw a4, 20(sp)
+; RV32IZDINXZHINX-NEXT:    sw a0, 20(sp)
+; RV32IZDINXZHINX-NEXT:    addi a0, sp, 8
 ; RV32IZDINXZHINX-NEXT:    call __trunctfhf2
 ; RV32IZDINXZHINX-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IZDINXZHINX-NEXT:    addi sp, sp, 32
@@ -2627,12 +2627,12 @@ define half @fcvt_h_q(fp128 %a) nounwind strictfp {
 ; CHECK32-IZFHMIN-NEXT:    lw a1, 0(a0)
 ; CHECK32-IZFHMIN-NEXT:    lw a2, 4(a0)
 ; CHECK32-IZFHMIN-NEXT:    lw a3, 8(a0)
-; CHECK32-IZFHMIN-NEXT:    lw a4, 12(a0)
-; CHECK32-IZFHMIN-NEXT:    addi a0, sp, 8
+; CHECK32-IZFHMIN-NEXT:    lw a0, 12(a0)
 ; CHECK32-IZFHMIN-NEXT:    sw a1, 8(sp)
 ; CHECK32-IZFHMIN-NEXT:    sw a2, 12(sp)
 ; CHECK32-IZFHMIN-NEXT:    sw a3, 16(sp)
-; CHECK32-IZFHMIN-NEXT:    sw a4, 20(sp)
+; CHECK32-IZFHMIN-NEXT:    sw a0, 20(sp)
+; CHECK32-IZFHMIN-NEXT:    addi a0, sp, 8
 ; CHECK32-IZFHMIN-NEXT:    call __trunctfhf2
 ; CHECK32-IZFHMIN-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; CHECK32-IZFHMIN-NEXT:    addi sp, sp, 32
@@ -2654,12 +2654,12 @@ define half @fcvt_h_q(fp128 %a) nounwind strictfp {
 ; CHECK32-IZHINXMIN-NEXT:    lw a1, 0(a0)
 ; CHECK32-IZHINXMIN-NEXT:    lw a2, 4(a0)
 ; CHECK32-IZHINXMIN-NEXT:    lw a3, 8(a0)
-; CHECK32-IZHINXMIN-NEXT:    lw a4, 12(a0)
-; CHECK32-IZHINXMIN-NEXT:    addi a0, sp, 8
+; CHECK32-IZHINXMIN-NEXT:    lw a0, 12(a0)
 ; CHECK32-IZHINXMIN-NEXT:    sw a1, 8(sp)
 ; CHECK32-IZHINXMIN-NEXT:    sw a2, 12(sp)
 ; CHECK32-IZHINXMIN-NEXT:    sw a3, 16(sp)
-; CHECK32-IZHINXMIN-NEXT:    sw a4, 20(sp)
+; CHECK32-IZHINXMIN-NEXT:    sw a0, 20(sp)
+; CHECK32-IZHINXMIN-NEXT:    addi a0, sp, 8
 ; CHECK32-IZHINXMIN-NEXT:    call __trunctfhf2
 ; CHECK32-IZHINXMIN-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; CHECK32-IZHINXMIN-NEXT:    addi sp, sp, 32
@@ -2681,12 +2681,12 @@ define half @fcvt_h_q(fp128 %a) nounwind strictfp {
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lw a1, 0(a0)
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lw a2, 4(a0)
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lw a3, 8(a0)
-; CHECK32-IZDINXZHINXMIN-NEXT:    lw a4, 12(a0)
-; CHECK32-IZDINXZHINXMIN-NEXT:    addi a0, sp, 8
+; CHECK32-IZDINXZHINXMIN-NEXT:    lw a0, 12(a0)
 ; CHECK32-IZDINXZHINXMIN-NEXT:    sw a1, 8(sp)
 ; CHECK32-IZDINXZHINXMIN-NEXT:    sw a2, 12(sp)
 ; CHECK32-IZDINXZHINXMIN-NEXT:    sw a3, 16(sp)
-; CHECK32-IZDINXZHINXMIN-NEXT:    sw a4, 20(sp)
+; CHECK32-IZDINXZHINXMIN-NEXT:    sw a0, 20(sp)
+; CHECK32-IZDINXZHINXMIN-NEXT:    addi a0, sp, 8
 ; CHECK32-IZDINXZHINXMIN-NEXT:    call __trunctfhf2
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; CHECK32-IZDINXZHINXMIN-NEXT:    addi sp, sp, 32
@@ -2708,12 +2708,12 @@ define half @fcvt_h_q(fp128 %a) nounwind strictfp {
 ; CHECK32-D-NEXT:    lw a1, 0(a0)
 ; CHECK32-D-NEXT:    lw a2, 4(a0)
 ; CHECK32-D-NEXT:    lw a3, 8(a0)
-; CHECK32-D-NEXT:    lw a4, 12(a0)
-; CHECK32-D-NEXT:    addi a0, sp, 8
+; CHECK32-D-NEXT:    lw a0, 12(a0)
 ; CHECK32-D-NEXT:    sw a1, 8(sp)
 ; CHECK32-D-NEXT:    sw a2, 12(sp)
 ; CHECK32-D-NEXT:    sw a3, 16(sp)
-; CHECK32-D-NEXT:    sw a4, 20(sp)
+; CHECK32-D-NEXT:    sw a0, 20(sp)
+; CHECK32-D-NEXT:    addi a0, sp, 8
 ; CHECK32-D-NEXT:    call __trunctfhf2
 ; CHECK32-D-NEXT:    fmv.x.w a0, fa0
 ; CHECK32-D-NEXT:    lui a1, 1048560
diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll
index cf57ecd6cd1e4..7841f0209ce24 100644
--- a/llvm/test/CodeGen/RISCV/half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/half-convert.ll
@@ -194,13 +194,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV32IZFH-LABEL: fcvt_si_h_sat:
 ; RV32IZFH:       # %bb.0: # %start
 ; RV32IZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV32IZFH-NEXT:    lui a0, %hi(.LCPI1_0)
-; RV32IZFH-NEXT:    feq.s a1, fa5, fa5
-; RV32IZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
 ; RV32IZFH-NEXT:    lui a0, 815104
-; RV32IZFH-NEXT:    fmv.w.x fa3, a0
-; RV32IZFH-NEXT:    fmax.s fa5, fa5, fa3
-; RV32IZFH-NEXT:    neg a0, a1
+; RV32IZFH-NEXT:    lui a1, %hi(.LCPI1_0)
+; RV32IZFH-NEXT:    fmv.w.x fa4, a0
+; RV32IZFH-NEXT:    feq.s a0, fa5, fa5
+; RV32IZFH-NEXT:    neg a0, a0
+; RV32IZFH-NEXT:    fmax.s fa5, fa5, fa4
+; RV32IZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
 ; RV32IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IZFH-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32IZFH-NEXT:    and a0, a0, a1
@@ -209,13 +209,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV64IZFH-LABEL: fcvt_si_h_sat:
 ; RV64IZFH:       # %bb.0: # %start
 ; RV64IZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV64IZFH-NEXT:    lui a0, %hi(.LCPI1_0)
-; RV64IZFH-NEXT:    feq.s a1, fa5, fa5
-; RV64IZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
 ; RV64IZFH-NEXT:    lui a0, 815104
-; RV64IZFH-NEXT:    fmv.w.x fa3, a0
-; RV64IZFH-NEXT:    fmax.s fa5, fa5, fa3
-; RV64IZFH-NEXT:    neg a0, a1
+; RV64IZFH-NEXT:    lui a1, %hi(.LCPI1_0)
+; RV64IZFH-NEXT:    fmv.w.x fa4, a0
+; RV64IZFH-NEXT:    feq.s a0, fa5, fa5
+; RV64IZFH-NEXT:    neg a0, a0
+; RV64IZFH-NEXT:    fmax.s fa5, fa5, fa4
+; RV64IZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
 ; RV64IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IZFH-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64IZFH-NEXT:    and a0, a0, a1
@@ -224,13 +224,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV32IDZFH-LABEL: fcvt_si_h_sat:
 ; RV32IDZFH:       # %bb.0: # %start
 ; RV32IDZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV32IDZFH-NEXT:    lui a0, %hi(.LCPI1_0)
-; RV32IDZFH-NEXT:    feq.s a1, fa5, fa5
-; RV32IDZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
 ; RV32IDZFH-NEXT:    lui a0, 815104
-; RV32IDZFH-NEXT:    fmv.w.x fa3, a0
-; RV32IDZFH-NEXT:    fmax.s fa5, fa5, fa3
-; RV32IDZFH-NEXT:    neg a0, a1
+; RV32IDZFH-NEXT:    lui a1, %hi(.LCPI1_0)
+; RV32IDZFH-NEXT:    fmv.w.x fa4, a0
+; RV32IDZFH-NEXT:    feq.s a0, fa5, fa5
+; RV32IDZFH-NEXT:    neg a0, a0
+; RV32IDZFH-NEXT:    fmax.s fa5, fa5, fa4
+; RV32IDZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
 ; RV32IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IDZFH-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32IDZFH-NEXT:    and a0, a0, a1
@@ -239,13 +239,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV64IDZFH-LABEL: fcvt_si_h_sat:
 ; RV64IDZFH:       # %bb.0: # %start
 ; RV64IDZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV64IDZFH-NEXT:    lui a0, %hi(.LCPI1_0)
-; RV64IDZFH-NEXT:    feq.s a1, fa5, fa5
-; RV64IDZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
 ; RV64IDZFH-NEXT:    lui a0, 815104
-; RV64IDZFH-NEXT:    fmv.w.x fa3, a0
-; RV64IDZFH-NEXT:    fmax.s fa5, fa5, fa3
-; RV64IDZFH-NEXT:    neg a0, a1
+; RV64IDZFH-NEXT:    lui a1, %hi(.LCPI1_0)
+; RV64IDZFH-NEXT:    fmv.w.x fa4, a0
+; RV64IDZFH-NEXT:    feq.s a0, fa5, fa5
+; RV64IDZFH-NEXT:    neg a0, a0
+; RV64IDZFH-NEXT:    fmax.s fa5, fa5, fa4
+; RV64IDZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
 ; RV64IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IDZFH-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64IDZFH-NEXT:    and a0, a0, a1
@@ -399,13 +399,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    call __extendhfsf2
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a0
-; RV32ID-ILP32-NEXT:    lui a0, %hi(.LCPI1_0)
-; RV32ID-ILP32-NEXT:    feq.s a1, fa5, fa5
-; RV32ID-ILP32-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
 ; RV32ID-ILP32-NEXT:    lui a0, 815104
-; RV32ID-ILP32-NEXT:    fmv.w.x fa3, a0
-; RV32ID-ILP32-NEXT:    fmax.s fa5, fa5, fa3
-; RV32ID-ILP32-NEXT:    neg a0, a1
+; RV32ID-ILP32-NEXT:    lui a1, %hi(.LCPI1_0)
+; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a0
+; RV32ID-ILP32-NEXT:    feq.s a0, fa5, fa5
+; RV32ID-ILP32-NEXT:    neg a0, a0
+; RV32ID-ILP32-NEXT:    fmax.s fa5, fa5, fa4
+; RV32ID-ILP32-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
 ; RV32ID-ILP32-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-ILP32-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32ID-ILP32-NEXT:    and a0, a0, a1
@@ -419,13 +419,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV64ID-LP64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-LP64-NEXT:    call __extendhfsf2
 ; RV64ID-LP64-NEXT:    fmv.w.x fa5, a0
-; RV64ID-LP64-NEXT:    lui a0, %hi(.LCPI1_0)
-; RV64ID-LP64-NEXT:    feq.s a1, fa5, fa5
-; RV64ID-LP64-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
 ; RV64ID-LP64-NEXT:    lui a0, 815104
-; RV64ID-LP64-NEXT:    fmv.w.x fa3, a0
-; RV64ID-LP64-NEXT:    fmax.s fa5, fa5, fa3
-; RV64ID-LP64-NEXT:    neg a0, a1
+; RV64ID-LP64-NEXT:    lui a1, %hi(.LCPI1_0)
+; RV64ID-LP64-NEXT:    fmv.w.x fa4, a0
+; RV64ID-LP64-NEXT:    feq.s a0, fa5, fa5
+; RV64ID-LP64-NEXT:    neg a0, a0
+; RV64ID-LP64-NEXT:    fmax.s fa5, fa5, fa4
+; RV64ID-LP64-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
 ; RV64ID-LP64-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-LP64-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64ID-LP64-NEXT:    and a0, a0, a1
@@ -439,13 +439,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV32ID-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    call __extendhfsf2
 ; RV32ID-NEXT:    feq.s a0, fa0, fa0
-; RV32ID-NEXT:    lui a1, %hi(.LCPI1_0)
-; RV32ID-NEXT:    flw fa5, %lo(.LCPI1_0)(a1)
 ; RV32ID-NEXT:    lui a1, 815104
-; RV32ID-NEXT:    fmv.w.x fa4, a1
-; RV32ID-NEXT:    fmax.s fa4, fa0, fa4
+; RV32ID-NEXT:    fmv.w.x fa5, a1
+; RV32ID-NEXT:    lui a1, %hi(.LCPI1_0)
 ; RV32ID-NEXT:    neg a0, a0
-; RV32ID-NEXT:    fmin.s fa5, fa4, fa5
+; RV32ID-NEXT:    fmax.s fa5, fa0, fa5
+; RV32ID-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
+; RV32ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32ID-NEXT:    and a0, a0, a1
 ; RV32ID-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -458,13 +458,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV64ID-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    call __extendhfsf2
 ; RV64ID-NEXT:    feq.s a0, fa0, fa0
-; RV64ID-NEXT:    lui a1, %hi(.LCPI1_0)
-; RV64ID-NEXT:    flw fa5, %lo(.LCPI1_0)(a1)
 ; RV64ID-NEXT:    lui a1, 815104
-; RV64ID-NEXT:    fmv.w.x fa4, a1
-; RV64ID-NEXT:    fmax.s fa4, fa0, fa4
+; RV64ID-NEXT:    fmv.w.x fa5, a1
+; RV64ID-NEXT:    lui a1, %hi(.LCPI1_0)
 ; RV64ID-NEXT:    neg a0, a0
-; RV64ID-NEXT:    fmin.s fa5, fa4, fa5
+; RV64ID-NEXT:    fmax.s fa5, fa0, fa5
+; RV64ID-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
+; RV64ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64ID-NEXT:    and a0, a0, a1
 ; RV64ID-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -474,13 +474,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; CHECK32-IZFHMIN-LABEL: fcvt_si_h_sat:
 ; CHECK32-IZFHMIN:       # %bb.0: # %start
 ; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; CHECK32-IZFHMIN-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK32-IZFHMIN-NEXT:    feq.s a1, fa5, fa5
-; CHECK32-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
 ; CHECK32-IZFHMIN-NEXT:    lui a0, 815104
-; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa3, a0
-; CHECK32-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa3
-; CHECK32-IZFHMIN-NEXT:    neg a0, a1
+; CHECK32-IZFHMIN-NEXT:    lui a1, %hi(.LCPI1_0)
+; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, a0
+; CHECK32-IZFHMIN-NEXT:    feq.s a0, fa5, fa5
+; CHECK32-IZFHMIN-NEXT:    neg a0, a0
+; CHECK32-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
+; CHECK32-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
 ; CHECK32-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32-IZFHMIN-NEXT:    fcvt.w.s a1, fa5, rtz
 ; CHECK32-IZFHMIN-NEXT:    and a0, a0, a1
@@ -489,13 +489,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; CHECK64-IZFHMIN-LABEL: fcvt_si_h_sat:
 ; CHECK64-IZFHMIN:       # %bb.0: # %start
 ; CHECK64-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; CHECK64-IZFHMIN-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK64-IZFHMIN-NEXT:    feq.s a1, fa5, fa5
-; CHECK64-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
 ; CHECK64-IZFHMIN-NEXT:    lui a0, 815104
-; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa3, a0
-; CHECK64-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa3
-; CHECK64-IZFHMIN-NEXT:    neg a0, a1
+; CHECK64-IZFHMIN-NEXT:    lui a1, %hi(.LCPI1_0)
+; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa4, a0
+; CHECK64-IZFHMIN-NEXT:    feq.s a0, fa5, fa5
+; CHECK64-IZFHMIN-NEXT:    neg a0, a0
+; CHECK64-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
+; CHECK64-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
 ; CHECK64-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64-IZFHMIN-NEXT:    fcvt.l.s a1, fa5, rtz
 ; CHECK64-IZFHMIN-NEXT:    and a0, a0, a1
@@ -711,45 +711,45 @@ define i16 @fcvt_ui_h(half %a) nounwind {
 define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; RV32IZFH-LABEL: fcvt_ui_h_sat:
 ; RV32IZFH:       # %bb.0: # %start
+; RV32IZFH-NEXT:    fcvt.s.h fa5, fa0
+; RV32IZFH-NEXT:    fmv.w.x fa4, zero
 ; RV32IZFH-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI3_0)(a0)
-; RV32IZFH-NEXT:    fcvt.s.h fa4, fa0
-; RV32IZFH-NEXT:    fmv.w.x fa3, zero
-; RV32IZFH-NEXT:    fmax.s fa4, fa4, fa3
-; RV32IZFH-NEXT:    fmin.s fa5, fa4, fa5
+; RV32IZFH-NEXT:    fmax.s fa5, fa5, fa4
+; RV32IZFH-NEXT:    flw fa4, %lo(.LCPI3_0)(a0)
+; RV32IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IZFH-NEXT:    fcvt.wu.s a0, fa5, rtz
 ; RV32IZFH-NEXT:    ret
 ;
 ; RV64IZFH-LABEL: fcvt_ui_h_sat:
 ; RV64IZFH:       # %bb.0: # %start
+; RV64IZFH-NEXT:    fcvt.s.h fa5, fa0
+; RV64IZFH-NEXT:    fmv.w.x fa4, zero
 ; RV64IZFH-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV64IZFH-NEXT:    flw fa5, %lo(.LCPI3_0)(a0)
-; RV64IZFH-NEXT:    fcvt.s.h fa4, fa0
-; RV64IZFH-NEXT:    fmv.w.x fa3, zero
-; RV64IZFH-NEXT:    fmax.s fa4, fa4, fa3
-; RV64IZFH-NEXT:    fmin.s fa5, fa4, fa5
+; RV64IZFH-NEXT:    fmax.s fa5, fa5, fa4
+; RV64IZFH-NEXT:    flw fa4, %lo(.LCPI3_0)(a0)
+; RV64IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IZFH-NEXT:    fcvt.lu.s a0, fa5, rtz
 ; RV64IZFH-NEXT:    ret
 ;
 ; RV32IDZFH-LABEL: fcvt_ui_h_sat:
 ; RV32IDZFH:       # %bb.0: # %start
+; RV32IDZFH-NEXT:    fcvt.s.h fa5, fa0
+; RV32IDZFH-NEXT:    fmv.w.x fa4, zero
 ; RV32IDZFH-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV32IDZFH-NEXT:    flw fa5, %lo(.LCPI3_0)(a0)
-; RV32IDZFH-NEXT:    fcvt.s.h fa4, fa0
-; RV32IDZFH-NEXT:    fmv.w.x fa3, zero
-; RV32IDZFH-NEXT:    fmax.s fa4, fa4, fa3
-; RV32IDZFH-NEXT:    fmin.s fa5, fa4, fa5
+; RV32IDZFH-NEXT:    fmax.s fa5, fa5, fa4
+; RV32IDZFH-NEXT:    flw fa4, %lo(.LCPI3_0)(a0)
+; RV32IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IDZFH-NEXT:    fcvt.wu.s a0, fa5, rtz
 ; RV32IDZFH-NEXT:    ret
 ;
 ; RV64IDZFH-LABEL: fcvt_ui_h_sat:
 ; RV64IDZFH:       # %bb.0: # %start
+; RV64IDZFH-NEXT:    fcvt.s.h fa5, fa0
+; RV64IDZFH-NEXT:    fmv.w.x fa4, zero
 ; RV64IDZFH-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV64IDZFH-NEXT:    flw fa5, %lo(.LCPI3_0)(a0)
-; RV64IDZFH-NEXT:    fcvt.s.h fa4, fa0
-; RV64IDZFH-NEXT:    fmv.w.x fa3, zero
-; RV64IDZFH-NEXT:    fmax.s fa4, fa4, fa3
-; RV64IDZFH-NEXT:    fmin.s fa5, fa4, fa5
+; RV64IDZFH-NEXT:    fmax.s fa5, fa5, fa4
+; RV64IDZFH-NEXT:    flw fa4, %lo(.LCPI3_0)(a0)
+; RV64IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IDZFH-NEXT:    fcvt.lu.s a0, fa5, rtz
 ; RV64IDZFH-NEXT:    ret
 ;
@@ -874,12 +874,12 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    addi sp, sp, -16
 ; RV32ID-ILP32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    call __extendhfsf2
-; RV32ID-ILP32-NEXT:    lui a1, %hi(.LCPI3_0)
-; RV32ID-ILP32-NEXT:    flw fa5, %lo(.LCPI3_0)(a1)
-; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a0
-; RV32ID-ILP32-NEXT:    fmv.w.x fa3, zero
-; RV32ID-ILP32-NEXT:    fmax.s fa4, fa4, fa3
-; RV32ID-ILP32-NEXT:    fmin.s fa5, fa4, fa5
+; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a0
+; RV32ID-ILP32-NEXT:    fmv.w.x fa4, zero
+; RV32ID-ILP32-NEXT:    lui a0, %hi(.LCPI3_0)
+; RV32ID-ILP32-NEXT:    fmax.s fa5, fa5, fa4
+; RV32ID-ILP32-NEXT:    flw fa4, %lo(.LCPI3_0)(a0)
+; RV32ID-ILP32-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-ILP32-NEXT:    fcvt.wu.s a0, fa5, rtz
 ; RV32ID-ILP32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32ID-ILP32-NEXT:    addi sp, sp, 16
@@ -890,12 +890,12 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; RV64ID-LP64-NEXT:    addi sp, sp, -16
 ; RV64ID-LP64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-LP64-NEXT:    call __extendhfsf2
-; RV64ID-LP64-NEXT:    lui a1, %hi(.LCPI3_0)
-; RV64ID-LP64-NEXT:    flw fa5, %lo(.LCPI3_0)(a1)
-; RV64ID-LP64-NEXT:    fmv.w.x fa4, a0
-; RV64ID-LP64-NEXT:    fmv.w.x fa3, zero
-; RV64ID-LP64-NEXT:    fmax.s fa4, fa4, fa3
-; RV64ID-LP64-NEXT:    fmin.s fa5, fa4, fa5
+; RV64ID-LP64-NEXT:    fmv.w.x fa5, a0
+; RV64ID-LP64-NEXT:    fmv.w.x fa4, zero
+; RV64ID-LP64-NEXT:    lui a0, %hi(.LCPI3_0)
+; RV64ID-LP64-NEXT:    fmax.s fa5, fa5, fa4
+; RV64ID-LP64-NEXT:    flw fa4, %lo(.LCPI3_0)(a0)
+; RV64ID-LP64-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-LP64-NEXT:    fcvt.lu.s a0, fa5, rtz
 ; RV64ID-LP64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64ID-LP64-NEXT:    addi sp, sp, 16
@@ -906,11 +906,11 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; RV32ID-NEXT:    addi sp, sp, -16
 ; RV32ID-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    call __extendhfsf2
+; RV32ID-NEXT:    fmv.w.x fa5, zero
 ; RV32ID-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV32ID-NEXT:    flw fa5, %lo(.LCPI3_0)(a0)
-; RV32ID-NEXT:    fmv.w.x fa4, zero
-; RV32ID-NEXT:    fmax.s fa4, fa0, fa4
-; RV32ID-NEXT:    fmin.s fa5, fa4, fa5
+; RV32ID-NEXT:    fmax.s fa5, fa0, fa5
+; RV32ID-NEXT:    flw fa4, %lo(.LCPI3_0)(a0)
+; RV32ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-NEXT:    fcvt.wu.s a0, fa5, rtz
 ; RV32ID-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    addi sp, sp, 16
@@ -921,11 +921,11 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; RV64ID-NEXT:    addi sp, sp, -16
 ; RV64ID-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    call __extendhfsf2
+; RV64ID-NEXT:    fmv.w.x fa5, zero
 ; RV64ID-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV64ID-NEXT:    flw fa5, %lo(.LCPI3_0)(a0)
-; RV64ID-NEXT:    fmv.w.x fa4, zero
-; RV64ID-NEXT:    fmax.s fa4, fa0, fa4
-; RV64ID-NEXT:    fmin.s fa5, fa4, fa5
+; RV64ID-NEXT:    fmax.s fa5, fa0, fa5
+; RV64ID-NEXT:    flw fa4, %lo(.LCPI3_0)(a0)
+; RV64ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-NEXT:    fcvt.lu.s a0, fa5, rtz
 ; RV64ID-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64ID-NEXT:    addi sp, sp, 16
@@ -933,23 +933,23 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ;
 ; CHECK32-IZFHMIN-LABEL: fcvt_ui_h_sat:
 ; CHECK32-IZFHMIN:       # %bb.0: # %start
+; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
+; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, zero
 ; CHECK32-IZFHMIN-NEXT:    lui a0, %hi(.LCPI3_0)
-; CHECK32-IZFHMIN-NEXT:    flw fa5, %lo(.LCPI3_0)(a0)
-; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa4, fa0
-; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa3, zero
-; CHECK32-IZFHMIN-NEXT:    fmax.s fa4, fa4, fa3
-; CHECK32-IZFHMIN-NEXT:    fmin.s fa5, fa4, fa5
+; CHECK32-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
+; CHECK32-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI3_0)(a0)
+; CHECK32-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32-IZFHMIN-NEXT:    fcvt.wu.s a0, fa5, rtz
 ; CHECK32-IZFHMIN-NEXT:    ret
 ;
 ; CHECK64-IZFHMIN-LABEL: fcvt_ui_h_sat:
 ; CHECK64-IZFHMIN:       # %bb.0: # %start
+; CHECK64-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
+; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa4, zero
 ; CHECK64-IZFHMIN-NEXT:    lui a0, %hi(.LCPI3_0)
-; CHECK64-IZFHMIN-NEXT:    flw fa5, %lo(.LCPI3_0)(a0)
-; CHECK64-IZFHMIN-NEXT:    fcvt.s.h fa4, fa0
-; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa3, zero
-; CHECK64-IZFHMIN-NEXT:    fmax.s fa4, fa4, fa3
-; CHECK64-IZFHMIN-NEXT:    fmin.s fa5, fa4, fa5
+; CHECK64-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
+; CHECK64-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI3_0)(a0)
+; CHECK64-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64-IZFHMIN-NEXT:    fcvt.lu.s a0, fa5, rtz
 ; CHECK64-IZFHMIN-NEXT:    ret
 ;
@@ -2904,14 +2904,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32IZFH-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFH-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IZFH-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT:    fcvt.s.h fa0, fa0
 ; RV32IZFH-NEXT:    lui a0, %hi(.LCPI12_0)
+; RV32IZFH-NEXT:    fmv.w.x fa5, zero
+; RV32IZFH-NEXT:    fle.s a1, fa5, fa0
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
-; RV32IZFH-NEXT:    fcvt.s.h fa0, fa0
-; RV32IZFH-NEXT:    fmv.w.x fa4, zero
-; RV32IZFH-NEXT:    fle.s a0, fa4, fa0
-; RV32IZFH-NEXT:    flt.s a1, fa5, fa0
-; RV32IZFH-NEXT:    neg s0, a1
-; RV32IZFH-NEXT:    neg s1, a0
+; RV32IZFH-NEXT:    flt.s a0, fa5, fa0
+; RV32IZFH-NEXT:    neg s0, a0
+; RV32IZFH-NEXT:    neg s1, a1
 ; RV32IZFH-NEXT:    call __fixunssfdi
 ; RV32IZFH-NEXT:    and a0, s1, a0
 ; RV32IZFH-NEXT:    and a1, s1, a1
@@ -2938,14 +2938,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32IDZFH-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IDZFH-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IDZFH-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IDZFH-NEXT:    fcvt.s.h fa0, fa0
 ; RV32IDZFH-NEXT:    lui a0, %hi(.LCPI12_0)
+; RV32IDZFH-NEXT:    fmv.w.x fa5, zero
+; RV32IDZFH-NEXT:    fle.s a1, fa5, fa0
 ; RV32IDZFH-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
-; RV32IDZFH-NEXT:    fcvt.s.h fa0, fa0
-; RV32IDZFH-NEXT:    fmv.w.x fa4, zero
-; RV32IDZFH-NEXT:    fle.s a0, fa4, fa0
-; RV32IDZFH-NEXT:    flt.s a1, fa5, fa0
-; RV32IDZFH-NEXT:    neg s0, a1
-; RV32IDZFH-NEXT:    neg s1, a0
+; RV32IDZFH-NEXT:    flt.s a0, fa5, fa0
+; RV32IDZFH-NEXT:    neg s0, a0
+; RV32IDZFH-NEXT:    neg s1, a1
 ; RV32IDZFH-NEXT:    call __fixunssfdi
 ; RV32IDZFH-NEXT:    and a0, s1, a0
 ; RV32IDZFH-NEXT:    and a1, s1, a1
@@ -3103,14 +3103,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    call __extendhfsf2
+; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a0
 ; RV32ID-ILP32-NEXT:    lui a1, %hi(.LCPI12_0)
-; RV32ID-ILP32-NEXT:    flw fa5, %lo(.LCPI12_0)(a1)
-; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a0
-; RV32ID-ILP32-NEXT:    fmv.w.x fa3, zero
-; RV32ID-ILP32-NEXT:    fle.s a1, fa3, fa4
-; RV32ID-ILP32-NEXT:    flt.s a2, fa5, fa4
-; RV32ID-ILP32-NEXT:    neg s0, a2
-; RV32ID-ILP32-NEXT:    neg s1, a1
+; RV32ID-ILP32-NEXT:    fmv.w.x fa4, zero
+; RV32ID-ILP32-NEXT:    fle.s a2, fa4, fa5
+; RV32ID-ILP32-NEXT:    flw fa4, %lo(.LCPI12_0)(a1)
+; RV32ID-ILP32-NEXT:    flt.s a1, fa4, fa5
+; RV32ID-ILP32-NEXT:    neg s0, a1
+; RV32ID-ILP32-NEXT:    neg s1, a2
 ; RV32ID-ILP32-NEXT:    call __fixunssfdi
 ; RV32ID-ILP32-NEXT:    and a0, s1, a0
 ; RV32ID-ILP32-NEXT:    and a1, s1, a1
@@ -3145,12 +3145,12 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32ID-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    call __extendhfsf2
 ; RV32ID-NEXT:    lui a0, %hi(.LCPI12_0)
+; RV32ID-NEXT:    fmv.w.x fa5, zero
+; RV32ID-NEXT:    fle.s a1, fa5, fa0
 ; RV32ID-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
-; RV32ID-NEXT:    fmv.w.x fa4, zero
-; RV32ID-NEXT:    fle.s a0, fa4, fa0
-; RV32ID-NEXT:    flt.s a1, fa5, fa0
-; RV32ID-NEXT:    neg s0, a1
-; RV32ID-NEXT:    neg s1, a0
+; RV32ID-NEXT:    flt.s a0, fa5, fa0
+; RV32ID-NEXT:    neg s0, a0
+; RV32ID-NEXT:    neg s1, a1
 ; RV32ID-NEXT:    call __fixunssfdi
 ; RV32ID-NEXT:    and a0, s1, a0
 ; RV32ID-NEXT:    and a1, s1, a1
@@ -3182,14 +3182,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; CHECK32-IZFHMIN-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; CHECK32-IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; CHECK32-IZFHMIN-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa0, fa0
 ; CHECK32-IZFHMIN-NEXT:    lui a0, %hi(.LCPI12_0)
+; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa5, zero
+; CHECK32-IZFHMIN-NEXT:    fle.s a1, fa5, fa0
 ; CHECK32-IZFHMIN-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
-; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa0, fa0
-; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECK32-IZFHMIN-NEXT:    fle.s a0, fa4, fa0
-; CHECK32-IZFHMIN-NEXT:    flt.s a1, fa5, fa0
-; CHECK32-IZFHMIN-NEXT:    neg s0, a1
-; CHECK32-IZFHMIN-NEXT:    neg s1, a0
+; CHECK32-IZFHMIN-NEXT:    flt.s a0, fa5, fa0
+; CHECK32-IZFHMIN-NEXT:    neg s0, a0
+; CHECK32-IZFHMIN-NEXT:    neg s1, a1
 ; CHECK32-IZFHMIN-NEXT:    call __fixunssfdi
 ; CHECK32-IZFHMIN-NEXT:    and a0, s1, a0
 ; CHECK32-IZFHMIN-NEXT:    and a1, s1, a1
@@ -6296,13 +6296,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV32IZFH-LABEL: fcvt_w_s_sat_i16:
 ; RV32IZFH:       # %bb.0: # %start
 ; RV32IZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV32IZFH-NEXT:    lui a0, %hi(.LCPI32_0)
-; RV32IZFH-NEXT:    feq.s a1, fa5, fa5
-; RV32IZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
 ; RV32IZFH-NEXT:    lui a0, 815104
-; RV32IZFH-NEXT:    fmv.w.x fa3, a0
-; RV32IZFH-NEXT:    fmax.s fa5, fa5, fa3
-; RV32IZFH-NEXT:    neg a0, a1
+; RV32IZFH-NEXT:    lui a1, %hi(.LCPI32_0)
+; RV32IZFH-NEXT:    fmv.w.x fa4, a0
+; RV32IZFH-NEXT:    feq.s a0, fa5, fa5
+; RV32IZFH-NEXT:    neg a0, a0
+; RV32IZFH-NEXT:    fmax.s fa5, fa5, fa4
+; RV32IZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
 ; RV32IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IZFH-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32IZFH-NEXT:    and a0, a0, a1
@@ -6311,13 +6311,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV64IZFH-LABEL: fcvt_w_s_sat_i16:
 ; RV64IZFH:       # %bb.0: # %start
 ; RV64IZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV64IZFH-NEXT:    lui a0, %hi(.LCPI32_0)
-; RV64IZFH-NEXT:    feq.s a1, fa5, fa5
-; RV64IZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
 ; RV64IZFH-NEXT:    lui a0, 815104
-; RV64IZFH-NEXT:    fmv.w.x fa3, a0
-; RV64IZFH-NEXT:    fmax.s fa5, fa5, fa3
-; RV64IZFH-NEXT:    neg a0, a1
+; RV64IZFH-NEXT:    lui a1, %hi(.LCPI32_0)
+; RV64IZFH-NEXT:    fmv.w.x fa4, a0
+; RV64IZFH-NEXT:    feq.s a0, fa5, fa5
+; RV64IZFH-NEXT:    neg a0, a0
+; RV64IZFH-NEXT:    fmax.s fa5, fa5, fa4
+; RV64IZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
 ; RV64IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IZFH-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64IZFH-NEXT:    and a0, a0, a1
@@ -6326,13 +6326,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV32IDZFH-LABEL: fcvt_w_s_sat_i16:
 ; RV32IDZFH:       # %bb.0: # %start
 ; RV32IDZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV32IDZFH-NEXT:    lui a0, %hi(.LCPI32_0)
-; RV32IDZFH-NEXT:    feq.s a1, fa5, fa5
-; RV32IDZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
 ; RV32IDZFH-NEXT:    lui a0, 815104
-; RV32IDZFH-NEXT:    fmv.w.x fa3, a0
-; RV32IDZFH-NEXT:    fmax.s fa5, fa5, fa3
-; RV32IDZFH-NEXT:    neg a0, a1
+; RV32IDZFH-NEXT:    lui a1, %hi(.LCPI32_0)
+; RV32IDZFH-NEXT:    fmv.w.x fa4, a0
+; RV32IDZFH-NEXT:    feq.s a0, fa5, fa5
+; RV32IDZFH-NEXT:    neg a0, a0
+; RV32IDZFH-NEXT:    fmax.s fa5, fa5, fa4
+; RV32IDZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
 ; RV32IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IDZFH-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32IDZFH-NEXT:    and a0, a0, a1
@@ -6341,13 +6341,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV64IDZFH-LABEL: fcvt_w_s_sat_i16:
 ; RV64IDZFH:       # %bb.0: # %start
 ; RV64IDZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV64IDZFH-NEXT:    lui a0, %hi(.LCPI32_0)
-; RV64IDZFH-NEXT:    feq.s a1, fa5, fa5
-; RV64IDZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
 ; RV64IDZFH-NEXT:    lui a0, 815104
-; RV64IDZFH-NEXT:    fmv.w.x fa3, a0
-; RV64IDZFH-NEXT:    fmax.s fa5, fa5, fa3
-; RV64IDZFH-NEXT:    neg a0, a1
+; RV64IDZFH-NEXT:    lui a1, %hi(.LCPI32_0)
+; RV64IDZFH-NEXT:    fmv.w.x fa4, a0
+; RV64IDZFH-NEXT:    feq.s a0, fa5, fa5
+; RV64IDZFH-NEXT:    neg a0, a0
+; RV64IDZFH-NEXT:    fmax.s fa5, fa5, fa4
+; RV64IDZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
 ; RV64IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IDZFH-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64IDZFH-NEXT:    and a0, a0, a1
@@ -6505,13 +6505,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    call __extendhfsf2
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a0
-; RV32ID-ILP32-NEXT:    lui a0, %hi(.LCPI32_0)
-; RV32ID-ILP32-NEXT:    feq.s a1, fa5, fa5
-; RV32ID-ILP32-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
 ; RV32ID-ILP32-NEXT:    lui a0, 815104
-; RV32ID-ILP32-NEXT:    fmv.w.x fa3, a0
-; RV32ID-ILP32-NEXT:    fmax.s fa5, fa5, fa3
-; RV32ID-ILP32-NEXT:    neg a0, a1
+; RV32ID-ILP32-NEXT:    lui a1, %hi(.LCPI32_0)
+; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a0
+; RV32ID-ILP32-NEXT:    feq.s a0, fa5, fa5
+; RV32ID-ILP32-NEXT:    neg a0, a0
+; RV32ID-ILP32-NEXT:    fmax.s fa5, fa5, fa4
+; RV32ID-ILP32-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
 ; RV32ID-ILP32-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-ILP32-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32ID-ILP32-NEXT:    and a0, a0, a1
@@ -6525,13 +6525,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV64ID-LP64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-LP64-NEXT:    call __extendhfsf2
 ; RV64ID-LP64-NEXT:    fmv.w.x fa5, a0
-; RV64ID-LP64-NEXT:    lui a0, %hi(.LCPI32_0)
-; RV64ID-LP64-NEXT:    feq.s a1, fa5, fa5
-; RV64ID-LP64-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
 ; RV64ID-LP64-NEXT:    lui a0, 815104
-; RV64ID-LP64-NEXT:    fmv.w.x fa3, a0
-; RV64ID-LP64-NEXT:    fmax.s fa5, fa5, fa3
-; RV64ID-LP64-NEXT:    neg a0, a1
+; RV64ID-LP64-NEXT:    lui a1, %hi(.LCPI32_0)
+; RV64ID-LP64-NEXT:    fmv.w.x fa4, a0
+; RV64ID-LP64-NEXT:    feq.s a0, fa5, fa5
+; RV64ID-LP64-NEXT:    neg a0, a0
+; RV64ID-LP64-NEXT:    fmax.s fa5, fa5, fa4
+; RV64ID-LP64-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
 ; RV64ID-LP64-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-LP64-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64ID-LP64-NEXT:    and a0, a0, a1
@@ -6545,13 +6545,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV32ID-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    call __extendhfsf2
 ; RV32ID-NEXT:    feq.s a0, fa0, fa0
-; RV32ID-NEXT:    lui a1, %hi(.LCPI32_0)
-; RV32ID-NEXT:    flw fa5, %lo(.LCPI32_0)(a1)
 ; RV32ID-NEXT:    lui a1, 815104
-; RV32ID-NEXT:    fmv.w.x fa4, a1
-; RV32ID-NEXT:    fmax.s fa4, fa0, fa4
+; RV32ID-NEXT:    fmv.w.x fa5, a1
+; RV32ID-NEXT:    lui a1, %hi(.LCPI32_0)
 ; RV32ID-NEXT:    neg a0, a0
-; RV32ID-NEXT:    fmin.s fa5, fa4, fa5
+; RV32ID-NEXT:    fmax.s fa5, fa0, fa5
+; RV32ID-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
+; RV32ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32ID-NEXT:    and a0, a0, a1
 ; RV32ID-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -6564,13 +6564,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV64ID-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    call __extendhfsf2
 ; RV64ID-NEXT:    feq.s a0, fa0, fa0
-; RV64ID-NEXT:    lui a1, %hi(.LCPI32_0)
-; RV64ID-NEXT:    flw fa5, %lo(.LCPI32_0)(a1)
 ; RV64ID-NEXT:    lui a1, 815104
-; RV64ID-NEXT:    fmv.w.x fa4, a1
-; RV64ID-NEXT:    fmax.s fa4, fa0, fa4
+; RV64ID-NEXT:    fmv.w.x fa5, a1
+; RV64ID-NEXT:    lui a1, %hi(.LCPI32_0)
 ; RV64ID-NEXT:    neg a0, a0
-; RV64ID-NEXT:    fmin.s fa5, fa4, fa5
+; RV64ID-NEXT:    fmax.s fa5, fa0, fa5
+; RV64ID-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
+; RV64ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64ID-NEXT:    and a0, a0, a1
 ; RV64ID-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -6580,13 +6580,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; CHECK32-IZFHMIN-LABEL: fcvt_w_s_sat_i16:
 ; CHECK32-IZFHMIN:       # %bb.0: # %start
 ; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; CHECK32-IZFHMIN-NEXT:    lui a0, %hi(.LCPI32_0)
-; CHECK32-IZFHMIN-NEXT:    feq.s a1, fa5, fa5
-; CHECK32-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
 ; CHECK32-IZFHMIN-NEXT:    lui a0, 815104
-; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa3, a0
-; CHECK32-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa3
-; CHECK32-IZFHMIN-NEXT:    neg a0, a1
+; CHECK32-IZFHMIN-NEXT:    lui a1, %hi(.LCPI32_0)
+; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, a0
+; CHECK32-IZFHMIN-NEXT:    feq.s a0, fa5, fa5
+; CHECK32-IZFHMIN-NEXT:    neg a0, a0
+; CHECK32-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
+; CHECK32-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
 ; CHECK32-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32-IZFHMIN-NEXT:    fcvt.w.s a1, fa5, rtz
 ; CHECK32-IZFHMIN-NEXT:    and a0, a0, a1
@@ -6595,13 +6595,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; CHECK64-IZFHMIN-LABEL: fcvt_w_s_sat_i16:
 ; CHECK64-IZFHMIN:       # %bb.0: # %start
 ; CHECK64-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; CHECK64-IZFHMIN-NEXT:    lui a0, %hi(.LCPI32_0)
-; CHECK64-IZFHMIN-NEXT:    feq.s a1, fa5, fa5
-; CHECK64-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
 ; CHECK64-IZFHMIN-NEXT:    lui a0, 815104
-; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa3, a0
-; CHECK64-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa3
-; CHECK64-IZFHMIN-NEXT:    neg a0, a1
+; CHECK64-IZFHMIN-NEXT:    lui a1, %hi(.LCPI32_0)
+; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa4, a0
+; CHECK64-IZFHMIN-NEXT:    feq.s a0, fa5, fa5
+; CHECK64-IZFHMIN-NEXT:    neg a0, a0
+; CHECK64-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
+; CHECK64-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
 ; CHECK64-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64-IZFHMIN-NEXT:    fcvt.l.s a1, fa5, rtz
 ; CHECK64-IZFHMIN-NEXT:    and a0, a0, a1
@@ -6816,45 +6816,45 @@ define zeroext i16 @fcvt_wu_s_i16(half %a) nounwind {
 define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; RV32IZFH-LABEL: fcvt_wu_s_sat_i16:
 ; RV32IZFH:       # %bb.0: # %start
+; RV32IZFH-NEXT:    fcvt.s.h fa5, fa0
+; RV32IZFH-NEXT:    fmv.w.x fa4, zero
 ; RV32IZFH-NEXT:    lui a0, %hi(.LCPI34_0)
-; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI34_0)(a0)
-; RV32IZFH-NEXT:    fcvt.s.h fa4, fa0
-; RV32IZFH-NEXT:    fmv.w.x fa3, zero
-; RV32IZFH-NEXT:    fmax.s fa4, fa4, fa3
-; RV32IZFH-NEXT:    fmin.s fa5, fa4, fa5
+; RV32IZFH-NEXT:    fmax.s fa5, fa5, fa4
+; RV32IZFH-NEXT:    flw fa4, %lo(.LCPI34_0)(a0)
+; RV32IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IZFH-NEXT:    fcvt.wu.s a0, fa5, rtz
 ; RV32IZFH-NEXT:    ret
 ;
 ; RV64IZFH-LABEL: fcvt_wu_s_sat_i16:
 ; RV64IZFH:       # %bb.0: # %start
+; RV64IZFH-NEXT:    fcvt.s.h fa5, fa0
+; RV64IZFH-NEXT:    fmv.w.x fa4, zero
 ; RV64IZFH-NEXT:    lui a0, %hi(.LCPI34_0)
-; RV64IZFH-NEXT:    flw fa5, %lo(.LCPI34_0)(a0)
-; RV64IZFH-NEXT:    fcvt.s.h fa4, fa0
-; RV64IZFH-NEXT:    fmv.w.x fa3, zero
-; RV64IZFH-NEXT:    fmax.s fa4, fa4, fa3
-; RV64IZFH-NEXT:    fmin.s fa5, fa4, fa5
+; RV64IZFH-NEXT:    fmax.s fa5, fa5, fa4
+; RV64IZFH-NEXT:    flw fa4, %lo(.LCPI34_0)(a0)
+; RV64IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IZFH-NEXT:    fcvt.lu.s a0, fa5, rtz
 ; RV64IZFH-NEXT:    ret
 ;
 ; RV32IDZFH-LABEL: fcvt_wu_s_sat_i16:
 ; RV32IDZFH:       # %bb.0: # %start
+; RV32IDZFH-NEXT:    fcvt.s.h fa5, fa0
+; RV32IDZFH-NEXT:    fmv.w.x fa4, zero
 ; RV32IDZFH-NEXT:    lui a0, %hi(.LCPI34_0)
-; RV32IDZFH-NEXT:    flw fa5, %lo(.LCPI34_0)(a0)
-; RV32IDZFH-NEXT:    fcvt.s.h fa4, fa0
-; RV32IDZFH-NEXT:    fmv.w.x fa3, zero
-; RV32IDZFH-NEXT:    fmax.s fa4, fa4, fa3
-; RV32IDZFH-NEXT:    fmin.s fa5, fa4, fa5
+; RV32IDZFH-NEXT:    fmax.s fa5, fa5, fa4
+; RV32IDZFH-NEXT:    flw fa4, %lo(.LCPI34_0)(a0)
+; RV32IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IDZFH-NEXT:    fcvt.wu.s a0, fa5, rtz
 ; RV32IDZFH-NEXT:    ret
 ;
 ; RV64IDZFH-LABEL: fcvt_wu_s_sat_i16:
 ; RV64IDZFH:       # %bb.0: # %start
+; RV64IDZFH-NEXT:    fcvt.s.h fa5, fa0
+; RV64IDZFH-NEXT:    fmv.w.x fa4, zero
 ; RV64IDZFH-NEXT:    lui a0, %hi(.LCPI34_0)
-; RV64IDZFH-NEXT:    flw fa5, %lo(.LCPI34_0)(a0)
-; RV64IDZFH-NEXT:    fcvt.s.h fa4, fa0
-; RV64IDZFH-NEXT:    fmv.w.x fa3, zero
-; RV64IDZFH-NEXT:    fmax.s fa4, fa4, fa3
-; RV64IDZFH-NEXT:    fmin.s fa5, fa4, fa5
+; RV64IDZFH-NEXT:    fmax.s fa5, fa5, fa4
+; RV64IDZFH-NEXT:    flw fa4, %lo(.LCPI34_0)(a0)
+; RV64IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IDZFH-NEXT:    fcvt.lu.s a0, fa5, rtz
 ; RV64IDZFH-NEXT:    ret
 ;
@@ -6985,12 +6985,12 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    addi sp, sp, -16
 ; RV32ID-ILP32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    call __extendhfsf2
-; RV32ID-ILP32-NEXT:    lui a1, %hi(.LCPI34_0)
-; RV32ID-ILP32-NEXT:    flw fa5, %lo(.LCPI34_0)(a1)
-; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a0
-; RV32ID-ILP32-NEXT:    fmv.w.x fa3, zero
-; RV32ID-ILP32-NEXT:    fmax.s fa4, fa4, fa3
-; RV32ID-ILP32-NEXT:    fmin.s fa5, fa4, fa5
+; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a0
+; RV32ID-ILP32-NEXT:    fmv.w.x fa4, zero
+; RV32ID-ILP32-NEXT:    lui a0, %hi(.LCPI34_0)
+; RV32ID-ILP32-NEXT:    fmax.s fa5, fa5, fa4
+; RV32ID-ILP32-NEXT:    flw fa4, %lo(.LCPI34_0)(a0)
+; RV32ID-ILP32-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-ILP32-NEXT:    fcvt.wu.s a0, fa5, rtz
 ; RV32ID-ILP32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32ID-ILP32-NEXT:    addi sp, sp, 16
@@ -7001,12 +7001,12 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; RV64ID-LP64-NEXT:    addi sp, sp, -16
 ; RV64ID-LP64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-LP64-NEXT:    call __extendhfsf2
-; RV64ID-LP64-NEXT:    lui a1, %hi(.LCPI34_0)
-; RV64ID-LP64-NEXT:    flw fa5, %lo(.LCPI34_0)(a1)
-; RV64ID-LP64-NEXT:    fmv.w.x fa4, a0
-; RV64ID-LP64-NEXT:    fmv.w.x fa3, zero
-; RV64ID-LP64-NEXT:    fmax.s fa4, fa4, fa3
-; RV64ID-LP64-NEXT:    fmin.s fa5, fa4, fa5
+; RV64ID-LP64-NEXT:    fmv.w.x fa5, a0
+; RV64ID-LP64-NEXT:    fmv.w.x fa4, zero
+; RV64ID-LP64-NEXT:    lui a0, %hi(.LCPI34_0)
+; RV64ID-LP64-NEXT:    fmax.s fa5, fa5, fa4
+; RV64ID-LP64-NEXT:    flw fa4, %lo(.LCPI34_0)(a0)
+; RV64ID-LP64-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-LP64-NEXT:    fcvt.lu.s a0, fa5, rtz
 ; RV64ID-LP64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64ID-LP64-NEXT:    addi sp, sp, 16
@@ -7017,11 +7017,11 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; RV32ID-NEXT:    addi sp, sp, -16
 ; RV32ID-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    call __extendhfsf2
+; RV32ID-NEXT:    fmv.w.x fa5, zero
 ; RV32ID-NEXT:    lui a0, %hi(.LCPI34_0)
-; RV32ID-NEXT:    flw fa5, %lo(.LCPI34_0)(a0)
-; RV32ID-NEXT:    fmv.w.x fa4, zero
-; RV32ID-NEXT:    fmax.s fa4, fa0, fa4
-; RV32ID-NEXT:    fmin.s fa5, fa4, fa5
+; RV32ID-NEXT:    fmax.s fa5, fa0, fa5
+; RV32ID-NEXT:    flw fa4, %lo(.LCPI34_0)(a0)
+; RV32ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-NEXT:    fcvt.wu.s a0, fa5, rtz
 ; RV32ID-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    addi sp, sp, 16
@@ -7032,11 +7032,11 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; RV64ID-NEXT:    addi sp, sp, -16
 ; RV64ID-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    call __extendhfsf2
+; RV64ID-NEXT:    fmv.w.x fa5, zero
 ; RV64ID-NEXT:    lui a0, %hi(.LCPI34_0)
-; RV64ID-NEXT:    flw fa5, %lo(.LCPI34_0)(a0)
-; RV64ID-NEXT:    fmv.w.x fa4, zero
-; RV64ID-NEXT:    fmax.s fa4, fa0, fa4
-; RV64ID-NEXT:    fmin.s fa5, fa4, fa5
+; RV64ID-NEXT:    fmax.s fa5, fa0, fa5
+; RV64ID-NEXT:    flw fa4, %lo(.LCPI34_0)(a0)
+; RV64ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-NEXT:    fcvt.lu.s a0, fa5, rtz
 ; RV64ID-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64ID-NEXT:    addi sp, sp, 16
@@ -7044,23 +7044,23 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ;
 ; CHECK32-IZFHMIN-LABEL: fcvt_wu_s_sat_i16:
 ; CHECK32-IZFHMIN:       # %bb.0: # %start
+; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
+; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, zero
 ; CHECK32-IZFHMIN-NEXT:    lui a0, %hi(.LCPI34_0)
-; CHECK32-IZFHMIN-NEXT:    flw fa5, %lo(.LCPI34_0)(a0)
-; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa4, fa0
-; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa3, zero
-; CHECK32-IZFHMIN-NEXT:    fmax.s fa4, fa4, fa3
-; CHECK32-IZFHMIN-NEXT:    fmin.s fa5, fa4, fa5
+; CHECK32-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
+; CHECK32-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI34_0)(a0)
+; CHECK32-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32-IZFHMIN-NEXT:    fcvt.wu.s a0, fa5, rtz
 ; CHECK32-IZFHMIN-NEXT:    ret
 ;
 ; CHECK64-IZFHMIN-LABEL: fcvt_wu_s_sat_i16:
 ; CHECK64-IZFHMIN:       # %bb.0: # %start
+; CHECK64-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
+; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa4, zero
 ; CHECK64-IZFHMIN-NEXT:    lui a0, %hi(.LCPI34_0)
-; CHECK64-IZFHMIN-NEXT:    flw fa5, %lo(.LCPI34_0)(a0)
-; CHECK64-IZFHMIN-NEXT:    fcvt.s.h fa4, fa0
-; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa3, zero
-; CHECK64-IZFHMIN-NEXT:    fmax.s fa4, fa4, fa3
-; CHECK64-IZFHMIN-NEXT:    fmin.s fa5, fa4, fa5
+; CHECK64-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
+; CHECK64-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI34_0)(a0)
+; CHECK64-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64-IZFHMIN-NEXT:    fcvt.lu.s a0, fa5, rtz
 ; CHECK64-IZFHMIN-NEXT:    ret
 ;
@@ -8595,16 +8595,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind {
 ; RV32IZFH-NEXT:    addi sp, sp, -32
 ; RV32IZFH-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32IZFH-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    lw a2, 0(a0)
-; RV32IZFH-NEXT:    lw a3, 4(a0)
-; RV32IZFH-NEXT:    lw a4, 8(a0)
-; RV32IZFH-NEXT:    lw a5, 12(a0)
 ; RV32IZFH-NEXT:    mv s0, a1
+; RV32IZFH-NEXT:    lw a1, 0(a0)
+; RV32IZFH-NEXT:    lw a2, 4(a0)
+; RV32IZFH-NEXT:    lw a3, 8(a0)
+; RV32IZFH-NEXT:    lw a0, 12(a0)
+; RV32IZFH-NEXT:    sw a1, 8(sp)
+; RV32IZFH-NEXT:    sw a2, 12(sp)
+; RV32IZFH-NEXT:    sw a3, 16(sp)
+; RV32IZFH-NEXT:    sw a0, 20(sp)
 ; RV32IZFH-NEXT:    addi a0, sp, 8
-; RV32IZFH-NEXT:    sw a2, 8(sp)
-; RV32IZFH-NEXT:    sw a3, 12(sp)
-; RV32IZFH-NEXT:    sw a4, 16(sp)
-; RV32IZFH-NEXT:    sw a5, 20(sp)
 ; RV32IZFH-NEXT:    call __trunctfhf2
 ; RV32IZFH-NEXT:    fsh fa0, 0(s0)
 ; RV32IZFH-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -8630,16 +8630,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind {
 ; RV32IDZFH-NEXT:    addi sp, sp, -32
 ; RV32IDZFH-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32IDZFH-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IDZFH-NEXT:    lw a2, 0(a0)
-; RV32IDZFH-NEXT:    lw a3, 4(a0)
-; RV32IDZFH-NEXT:    lw a4, 8(a0)
-; RV32IDZFH-NEXT:    lw a5, 12(a0)
 ; RV32IDZFH-NEXT:    mv s0, a1
+; RV32IDZFH-NEXT:    lw a1, 0(a0)
+; RV32IDZFH-NEXT:    lw a2, 4(a0)
+; RV32IDZFH-NEXT:    lw a3, 8(a0)
+; RV32IDZFH-NEXT:    lw a0, 12(a0)
+; RV32IDZFH-NEXT:    sw a1, 8(sp)
+; RV32IDZFH-NEXT:    sw a2, 12(sp)
+; RV32IDZFH-NEXT:    sw a3, 16(sp)
+; RV32IDZFH-NEXT:    sw a0, 20(sp)
 ; RV32IDZFH-NEXT:    addi a0, sp, 8
-; RV32IDZFH-NEXT:    sw a2, 8(sp)
-; RV32IDZFH-NEXT:    sw a3, 12(sp)
-; RV32IDZFH-NEXT:    sw a4, 16(sp)
-; RV32IDZFH-NEXT:    sw a5, 20(sp)
 ; RV32IDZFH-NEXT:    call __trunctfhf2
 ; RV32IDZFH-NEXT:    fsh fa0, 0(s0)
 ; RV32IDZFH-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -8665,16 +8665,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind {
 ; RV32IZHINX-NEXT:    addi sp, sp, -32
 ; RV32IZHINX-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32IZHINX-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IZHINX-NEXT:    lw a2, 0(a0)
-; RV32IZHINX-NEXT:    lw a3, 4(a0)
-; RV32IZHINX-NEXT:    lw a4, 8(a0)
-; RV32IZHINX-NEXT:    lw a5, 12(a0)
 ; RV32IZHINX-NEXT:    mv s0, a1
+; RV32IZHINX-NEXT:    lw a1, 0(a0)
+; RV32IZHINX-NEXT:    lw a2, 4(a0)
+; RV32IZHINX-NEXT:    lw a3, 8(a0)
+; RV32IZHINX-NEXT:    lw a0, 12(a0)
+; RV32IZHINX-NEXT:    sw a1, 8(sp)
+; RV32IZHINX-NEXT:    sw a2, 12(sp)
+; RV32IZHINX-NEXT:    sw a3, 16(sp)
+; RV32IZHINX-NEXT:    sw a0, 20(sp)
 ; RV32IZHINX-NEXT:    addi a0, sp, 8
-; RV32IZHINX-NEXT:    sw a2, 8(sp)
-; RV32IZHINX-NEXT:    sw a3, 12(sp)
-; RV32IZHINX-NEXT:    sw a4, 16(sp)
-; RV32IZHINX-NEXT:    sw a5, 20(sp)
 ; RV32IZHINX-NEXT:    call __trunctfhf2
 ; RV32IZHINX-NEXT:    sh a0, 0(s0)
 ; RV32IZHINX-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -8700,16 +8700,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind {
 ; RV32IZDINXZHINX-NEXT:    addi sp, sp, -32
 ; RV32IZDINXZHINX-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32IZDINXZHINX-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IZDINXZHINX-NEXT:    lw a2, 0(a0)
-; RV32IZDINXZHINX-NEXT:    lw a3, 4(a0)
-; RV32IZDINXZHINX-NEXT:    lw a4, 8(a0)
-; RV32IZDINXZHINX-NEXT:    lw a5, 12(a0)
 ; RV32IZDINXZHINX-NEXT:    mv s0, a1
+; RV32IZDINXZHINX-NEXT:    lw a1, 0(a0)
+; RV32IZDINXZHINX-NEXT:    lw a2, 4(a0)
+; RV32IZDINXZHINX-NEXT:    lw a3, 8(a0)
+; RV32IZDINXZHINX-NEXT:    lw a0, 12(a0)
+; RV32IZDINXZHINX-NEXT:    sw a1, 8(sp)
+; RV32IZDINXZHINX-NEXT:    sw a2, 12(sp)
+; RV32IZDINXZHINX-NEXT:    sw a3, 16(sp)
+; RV32IZDINXZHINX-NEXT:    sw a0, 20(sp)
 ; RV32IZDINXZHINX-NEXT:    addi a0, sp, 8
-; RV32IZDINXZHINX-NEXT:    sw a2, 8(sp)
-; RV32IZDINXZHINX-NEXT:    sw a3, 12(sp)
-; RV32IZDINXZHINX-NEXT:    sw a4, 16(sp)
-; RV32IZDINXZHINX-NEXT:    sw a5, 20(sp)
 ; RV32IZDINXZHINX-NEXT:    call __trunctfhf2
 ; RV32IZDINXZHINX-NEXT:    sh a0, 0(s0)
 ; RV32IZDINXZHINX-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -8735,16 +8735,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -32
 ; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a2, 0(a0)
-; RV32I-NEXT:    lw a3, 4(a0)
-; RV32I-NEXT:    lw a4, 8(a0)
-; RV32I-NEXT:    lw a5, 12(a0)
 ; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    lw a1, 0(a0)
+; RV32I-NEXT:    lw a2, 4(a0)
+; RV32I-NEXT:    lw a3, 8(a0)
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    sw a1, 8(sp)
+; RV32I-NEXT:    sw a2, 12(sp)
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    sw a0, 20(sp)
 ; RV32I-NEXT:    addi a0, sp, 8
-; RV32I-NEXT:    sw a2, 8(sp)
-; RV32I-NEXT:    sw a3, 12(sp)
-; RV32I-NEXT:    sw a4, 16(sp)
-; RV32I-NEXT:    sw a5, 20(sp)
 ; RV32I-NEXT:    call __trunctfhf2
 ; RV32I-NEXT:    sh a0, 0(s0)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -8770,16 +8770,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind {
 ; RV32ID-ILP32-NEXT:    addi sp, sp, -32
 ; RV32ID-ILP32-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32ID-ILP32-NEXT:    lw a2, 0(a0)
-; RV32ID-ILP32-NEXT:    lw a3, 4(a0)
-; RV32ID-ILP32-NEXT:    lw a4, 8(a0)
-; RV32ID-ILP32-NEXT:    lw a5, 12(a0)
 ; RV32ID-ILP32-NEXT:    mv s0, a1
+; RV32ID-ILP32-NEXT:    lw a1, 0(a0)
+; RV32ID-ILP32-NEXT:    lw a2, 4(a0)
+; RV32ID-ILP32-NEXT:    lw a3, 8(a0)
+; RV32ID-ILP32-NEXT:    lw a0, 12(a0)
+; RV32ID-ILP32-NEXT:    sw a1, 8(sp)
+; RV32ID-ILP32-NEXT:    sw a2, 12(sp)
+; RV32ID-ILP32-NEXT:    sw a3, 16(sp)
+; RV32ID-ILP32-NEXT:    sw a0, 20(sp)
 ; RV32ID-ILP32-NEXT:    addi a0, sp, 8
-; RV32ID-ILP32-NEXT:    sw a2, 8(sp)
-; RV32ID-ILP32-NEXT:    sw a3, 12(sp)
-; RV32ID-ILP32-NEXT:    sw a4, 16(sp)
-; RV32ID-ILP32-NEXT:    sw a5, 20(sp)
 ; RV32ID-ILP32-NEXT:    call __trunctfhf2
 ; RV32ID-ILP32-NEXT:    sh a0, 0(s0)
 ; RV32ID-ILP32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -8805,16 +8805,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind {
 ; RV32ID-NEXT:    addi sp, sp, -32
 ; RV32ID-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32ID-NEXT:    lw a2, 0(a0)
-; RV32ID-NEXT:    lw a3, 4(a0)
-; RV32ID-NEXT:    lw a4, 8(a0)
-; RV32ID-NEXT:    lw a5, 12(a0)
 ; RV32ID-NEXT:    mv s0, a1
+; RV32ID-NEXT:    lw a1, 0(a0)
+; RV32ID-NEXT:    lw a2, 4(a0)
+; RV32ID-NEXT:    lw a3, 8(a0)
+; RV32ID-NEXT:    lw a0, 12(a0)
+; RV32ID-NEXT:    sw a1, 8(sp)
+; RV32ID-NEXT:    sw a2, 12(sp)
+; RV32ID-NEXT:    sw a3, 16(sp)
+; RV32ID-NEXT:    sw a0, 20(sp)
 ; RV32ID-NEXT:    addi a0, sp, 8
-; RV32ID-NEXT:    sw a2, 8(sp)
-; RV32ID-NEXT:    sw a3, 12(sp)
-; RV32ID-NEXT:    sw a4, 16(sp)
-; RV32ID-NEXT:    sw a5, 20(sp)
 ; RV32ID-NEXT:    call __trunctfhf2
 ; RV32ID-NEXT:    fmv.x.w a0, fa0
 ; RV32ID-NEXT:    sh a0, 0(s0)
@@ -8842,16 +8842,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind {
 ; CHECK32-IZFHMIN-NEXT:    addi sp, sp, -32
 ; CHECK32-IZFHMIN-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; CHECK32-IZFHMIN-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; CHECK32-IZFHMIN-NEXT:    lw a2, 0(a0)
-; CHECK32-IZFHMIN-NEXT:    lw a3, 4(a0)
-; CHECK32-IZFHMIN-NEXT:    lw a4, 8(a0)
-; CHECK32-IZFHMIN-NEXT:    lw a5, 12(a0)
 ; CHECK32-IZFHMIN-NEXT:    mv s0, a1
+; CHECK32-IZFHMIN-NEXT:    lw a1, 0(a0)
+; CHECK32-IZFHMIN-NEXT:    lw a2, 4(a0)
+; CHECK32-IZFHMIN-NEXT:    lw a3, 8(a0)
+; CHECK32-IZFHMIN-NEXT:    lw a0, 12(a0)
+; CHECK32-IZFHMIN-NEXT:    sw a1, 8(sp)
+; CHECK32-IZFHMIN-NEXT:    sw a2, 12(sp)
+; CHECK32-IZFHMIN-NEXT:    sw a3, 16(sp)
+; CHECK32-IZFHMIN-NEXT:    sw a0, 20(sp)
 ; CHECK32-IZFHMIN-NEXT:    addi a0, sp, 8
-; CHECK32-IZFHMIN-NEXT:    sw a2, 8(sp)
-; CHECK32-IZFHMIN-NEXT:    sw a3, 12(sp)
-; CHECK32-IZFHMIN-NEXT:    sw a4, 16(sp)
-; CHECK32-IZFHMIN-NEXT:    sw a5, 20(sp)
 ; CHECK32-IZFHMIN-NEXT:    call __trunctfhf2
 ; CHECK32-IZFHMIN-NEXT:    fsh fa0, 0(s0)
 ; CHECK32-IZFHMIN-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -8877,16 +8877,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind {
 ; CHECK32-IZHINXMIN-NEXT:    addi sp, sp, -32
 ; CHECK32-IZHINXMIN-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; CHECK32-IZHINXMIN-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; CHECK32-IZHINXMIN-NEXT:    lw a2, 0(a0)
-; CHECK32-IZHINXMIN-NEXT:    lw a3, 4(a0)
-; CHECK32-IZHINXMIN-NEXT:    lw a4, 8(a0)
-; CHECK32-IZHINXMIN-NEXT:    lw a5, 12(a0)
 ; CHECK32-IZHINXMIN-NEXT:    mv s0, a1
+; CHECK32-IZHINXMIN-NEXT:    lw a1, 0(a0)
+; CHECK32-IZHINXMIN-NEXT:    lw a2, 4(a0)
+; CHECK32-IZHINXMIN-NEXT:    lw a3, 8(a0)
+; CHECK32-IZHINXMIN-NEXT:    lw a0, 12(a0)
+; CHECK32-IZHINXMIN-NEXT:    sw a1, 8(sp)
+; CHECK32-IZHINXMIN-NEXT:    sw a2, 12(sp)
+; CHECK32-IZHINXMIN-NEXT:    sw a3, 16(sp)
+; CHECK32-IZHINXMIN-NEXT:    sw a0, 20(sp)
 ; CHECK32-IZHINXMIN-NEXT:    addi a0, sp, 8
-; CHECK32-IZHINXMIN-NEXT:    sw a2, 8(sp)
-; CHECK32-IZHINXMIN-NEXT:    sw a3, 12(sp)
-; CHECK32-IZHINXMIN-NEXT:    sw a4, 16(sp)
-; CHECK32-IZHINXMIN-NEXT:    sw a5, 20(sp)
 ; CHECK32-IZHINXMIN-NEXT:    call __trunctfhf2
 ; CHECK32-IZHINXMIN-NEXT:    sh a0, 0(s0)
 ; CHECK32-IZHINXMIN-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -8912,16 +8912,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind {
 ; CHECK32-IZDINXZHINXMIN-NEXT:    addi sp, sp, -32
 ; CHECK32-IZDINXZHINXMIN-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; CHECK32-IZDINXZHINXMIN-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; CHECK32-IZDINXZHINXMIN-NEXT:    lw a2, 0(a0)
-; CHECK32-IZDINXZHINXMIN-NEXT:    lw a3, 4(a0)
-; CHECK32-IZDINXZHINXMIN-NEXT:    lw a4, 8(a0)
-; CHECK32-IZDINXZHINXMIN-NEXT:    lw a5, 12(a0)
 ; CHECK32-IZDINXZHINXMIN-NEXT:    mv s0, a1
+; CHECK32-IZDINXZHINXMIN-NEXT:    lw a1, 0(a0)
+; CHECK32-IZDINXZHINXMIN-NEXT:    lw a2, 4(a0)
+; CHECK32-IZDINXZHINXMIN-NEXT:    lw a3, 8(a0)
+; CHECK32-IZDINXZHINXMIN-NEXT:    lw a0, 12(a0)
+; CHECK32-IZDINXZHINXMIN-NEXT:    sw a1, 8(sp)
+; CHECK32-IZDINXZHINXMIN-NEXT:    sw a2, 12(sp)
+; CHECK32-IZDINXZHINXMIN-NEXT:    sw a3, 16(sp)
+; CHECK32-IZDINXZHINXMIN-NEXT:    sw a0, 20(sp)
 ; CHECK32-IZDINXZHINXMIN-NEXT:    addi a0, sp, 8
-; CHECK32-IZDINXZHINXMIN-NEXT:    sw a2, 8(sp)
-; CHECK32-IZDINXZHINXMIN-NEXT:    sw a3, 12(sp)
-; CHECK32-IZDINXZHINXMIN-NEXT:    sw a4, 16(sp)
-; CHECK32-IZDINXZHINXMIN-NEXT:    sw a5, 20(sp)
 ; CHECK32-IZDINXZHINXMIN-NEXT:    call __trunctfhf2
 ; CHECK32-IZDINXZHINXMIN-NEXT:    sh a0, 0(s0)
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll
index 12cf088e3205f..7754f5b8f9f3a 100644
--- a/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll
+++ b/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll
@@ -222,8 +222,8 @@ define i32 @fcmp_one(half %a, half %b) nounwind strictfp {
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    flt.h a2, fa1, fa0
 ; CHECK-NEXT:    fsflags a0
-; CHECK-NEXT:    or a0, a2, a1
 ; CHECK-NEXT:    feq.h zero, fa1, fa0
+; CHECK-NEXT:    or a0, a2, a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECKIZHINX-LABEL: fcmp_one:
@@ -235,9 +235,8 @@ define i32 @fcmp_one(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINX-NEXT:    frflags a2
 ; CHECKIZHINX-NEXT:    flt.h a4, a1, a0
 ; CHECKIZHINX-NEXT:    fsflags a2
-; CHECKIZHINX-NEXT:    or a2, a4, a3
 ; CHECKIZHINX-NEXT:    feq.h zero, a1, a0
-; CHECKIZHINX-NEXT:    mv a0, a2
+; CHECKIZHINX-NEXT:    or a0, a4, a3
 ; CHECKIZHINX-NEXT:    ret
 ;
 ; CHECKIZFHMIN-LABEL: fcmp_one:
@@ -249,23 +248,23 @@ define i32 @fcmp_one(half %a, half %b) nounwind strictfp {
 ; CHECKIZFHMIN-NEXT:    frflags a0
 ; CHECKIZFHMIN-NEXT:    flt.h a2, fa1, fa0
 ; CHECKIZFHMIN-NEXT:    fsflags a0
-; CHECKIZFHMIN-NEXT:    or a0, a2, a1
 ; CHECKIZFHMIN-NEXT:    feq.h zero, fa1, fa0
+; CHECKIZFHMIN-NEXT:    or a0, a2, a1
 ; CHECKIZFHMIN-NEXT:    ret
 ;
 ; CHECKIZHINXMIN-LABEL: fcmp_one:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a0
-; CHECKIZHINXMIN-NEXT:    frflags a0
-; CHECKIZHINXMIN-NEXT:    flt.s a3, a2, a1
-; CHECKIZHINXMIN-NEXT:    fsflags a0
-; CHECKIZHINXMIN-NEXT:    feq.s zero, a2, a1
-; CHECKIZHINXMIN-NEXT:    frflags a0
-; CHECKIZHINXMIN-NEXT:    flt.s a4, a1, a2
-; CHECKIZHINXMIN-NEXT:    fsflags a0
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    frflags a2
+; CHECKIZHINXMIN-NEXT:    flt.s a3, a0, a1
+; CHECKIZHINXMIN-NEXT:    fsflags a2
+; CHECKIZHINXMIN-NEXT:    feq.s zero, a0, a1
+; CHECKIZHINXMIN-NEXT:    frflags a2
+; CHECKIZHINXMIN-NEXT:    flt.s a4, a1, a0
+; CHECKIZHINXMIN-NEXT:    fsflags a2
+; CHECKIZHINXMIN-NEXT:    feq.s zero, a1, a0
 ; CHECKIZHINXMIN-NEXT:    or a0, a4, a3
-; CHECKIZHINXMIN-NEXT:    feq.s zero, a1, a2
 ; CHECKIZHINXMIN-NEXT:    ret
   %1 = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"one", metadata !"fpexcept.strict") strictfp
   %2 = zext i1 %1 to i32
@@ -319,9 +318,9 @@ define i32 @fcmp_ueq(half %a, half %b) nounwind strictfp {
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    flt.h a2, fa1, fa0
 ; CHECK-NEXT:    fsflags a0
+; CHECK-NEXT:    feq.h zero, fa1, fa0
 ; CHECK-NEXT:    or a1, a2, a1
 ; CHECK-NEXT:    xori a0, a1, 1
-; CHECK-NEXT:    feq.h zero, fa1, fa0
 ; CHECK-NEXT:    ret
 ;
 ; CHECKIZHINX-LABEL: fcmp_ueq:
@@ -333,10 +332,9 @@ define i32 @fcmp_ueq(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINX-NEXT:    frflags a2
 ; CHECKIZHINX-NEXT:    flt.h a4, a1, a0
 ; CHECKIZHINX-NEXT:    fsflags a2
-; CHECKIZHINX-NEXT:    or a3, a4, a3
-; CHECKIZHINX-NEXT:    xori a2, a3, 1
 ; CHECKIZHINX-NEXT:    feq.h zero, a1, a0
-; CHECKIZHINX-NEXT:    mv a0, a2
+; CHECKIZHINX-NEXT:    or a3, a4, a3
+; CHECKIZHINX-NEXT:    xori a0, a3, 1
 ; CHECKIZHINX-NEXT:    ret
 ;
 ; CHECKIZFHMIN-LABEL: fcmp_ueq:
@@ -348,25 +346,25 @@ define i32 @fcmp_ueq(half %a, half %b) nounwind strictfp {
 ; CHECKIZFHMIN-NEXT:    frflags a0
 ; CHECKIZFHMIN-NEXT:    flt.h a2, fa1, fa0
 ; CHECKIZFHMIN-NEXT:    fsflags a0
+; CHECKIZFHMIN-NEXT:    feq.h zero, fa1, fa0
 ; CHECKIZFHMIN-NEXT:    or a1, a2, a1
 ; CHECKIZFHMIN-NEXT:    xori a0, a1, 1
-; CHECKIZFHMIN-NEXT:    feq.h zero, fa1, fa0
 ; CHECKIZFHMIN-NEXT:    ret
 ;
 ; CHECKIZHINXMIN-LABEL: fcmp_ueq:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a0
-; CHECKIZHINXMIN-NEXT:    frflags a0
-; CHECKIZHINXMIN-NEXT:    flt.s a3, a2, a1
-; CHECKIZHINXMIN-NEXT:    fsflags a0
-; CHECKIZHINXMIN-NEXT:    feq.s zero, a2, a1
-; CHECKIZHINXMIN-NEXT:    frflags a0
-; CHECKIZHINXMIN-NEXT:    flt.s a4, a1, a2
-; CHECKIZHINXMIN-NEXT:    fsflags a0
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    frflags a2
+; CHECKIZHINXMIN-NEXT:    flt.s a3, a0, a1
+; CHECKIZHINXMIN-NEXT:    fsflags a2
+; CHECKIZHINXMIN-NEXT:    feq.s zero, a0, a1
+; CHECKIZHINXMIN-NEXT:    frflags a2
+; CHECKIZHINXMIN-NEXT:    flt.s a4, a1, a0
+; CHECKIZHINXMIN-NEXT:    fsflags a2
+; CHECKIZHINXMIN-NEXT:    feq.s zero, a1, a0
 ; CHECKIZHINXMIN-NEXT:    or a3, a4, a3
 ; CHECKIZHINXMIN-NEXT:    xori a0, a3, 1
-; CHECKIZHINXMIN-NEXT:    feq.s zero, a1, a2
 ; CHECKIZHINXMIN-NEXT:    ret
   %1 = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ueq", metadata !"fpexcept.strict") strictfp
   %2 = zext i1 %1 to i32
@@ -379,8 +377,8 @@ define i32 @fcmp_ugt(half %a, half %b) nounwind strictfp {
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    fle.h a1, fa0, fa1
 ; CHECK-NEXT:    fsflags a0
-; CHECK-NEXT:    xori a0, a1, 1
 ; CHECK-NEXT:    feq.h zero, fa0, fa1
+; CHECK-NEXT:    xori a0, a1, 1
 ; CHECK-NEXT:    ret
 ;
 ; CHECKIZHINX-LABEL: fcmp_ugt:
@@ -388,9 +386,8 @@ define i32 @fcmp_ugt(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINX-NEXT:    frflags a2
 ; CHECKIZHINX-NEXT:    fle.h a3, a0, a1
 ; CHECKIZHINX-NEXT:    fsflags a2
-; CHECKIZHINX-NEXT:    xori a2, a3, 1
 ; CHECKIZHINX-NEXT:    feq.h zero, a0, a1
-; CHECKIZHINX-NEXT:    mv a0, a2
+; CHECKIZHINX-NEXT:    xori a0, a3, 1
 ; CHECKIZHINX-NEXT:    ret
 ;
 ; CHECKIZFHMIN-LABEL: fcmp_ugt:
@@ -398,19 +395,19 @@ define i32 @fcmp_ugt(half %a, half %b) nounwind strictfp {
 ; CHECKIZFHMIN-NEXT:    frflags a0
 ; CHECKIZFHMIN-NEXT:    fle.h a1, fa0, fa1
 ; CHECKIZFHMIN-NEXT:    fsflags a0
-; CHECKIZFHMIN-NEXT:    xori a0, a1, 1
 ; CHECKIZFHMIN-NEXT:    feq.h zero, fa0, fa1
+; CHECKIZFHMIN-NEXT:    xori a0, a1, 1
 ; CHECKIZFHMIN-NEXT:    ret
 ;
 ; CHECKIZHINXMIN-LABEL: fcmp_ugt:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a0
-; CHECKIZHINXMIN-NEXT:    frflags a0
-; CHECKIZHINXMIN-NEXT:    fle.s a3, a2, a1
-; CHECKIZHINXMIN-NEXT:    fsflags a0
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    frflags a2
+; CHECKIZHINXMIN-NEXT:    fle.s a3, a0, a1
+; CHECKIZHINXMIN-NEXT:    fsflags a2
+; CHECKIZHINXMIN-NEXT:    feq.s zero, a0, a1
 ; CHECKIZHINXMIN-NEXT:    xori a0, a3, 1
-; CHECKIZHINXMIN-NEXT:    feq.s zero, a2, a1
 ; CHECKIZHINXMIN-NEXT:    ret
   %1 = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ugt", metadata !"fpexcept.strict") strictfp
   %2 = zext i1 %1 to i32
@@ -423,8 +420,8 @@ define i32 @fcmp_uge(half %a, half %b) nounwind strictfp {
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    flt.h a1, fa0, fa1
 ; CHECK-NEXT:    fsflags a0
-; CHECK-NEXT:    xori a0, a1, 1
 ; CHECK-NEXT:    feq.h zero, fa0, fa1
+; CHECK-NEXT:    xori a0, a1, 1
 ; CHECK-NEXT:    ret
 ;
 ; CHECKIZHINX-LABEL: fcmp_uge:
@@ -432,9 +429,8 @@ define i32 @fcmp_uge(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINX-NEXT:    frflags a2
 ; CHECKIZHINX-NEXT:    flt.h a3, a0, a1
 ; CHECKIZHINX-NEXT:    fsflags a2
-; CHECKIZHINX-NEXT:    xori a2, a3, 1
 ; CHECKIZHINX-NEXT:    feq.h zero, a0, a1
-; CHECKIZHINX-NEXT:    mv a0, a2
+; CHECKIZHINX-NEXT:    xori a0, a3, 1
 ; CHECKIZHINX-NEXT:    ret
 ;
 ; CHECKIZFHMIN-LABEL: fcmp_uge:
@@ -442,19 +438,19 @@ define i32 @fcmp_uge(half %a, half %b) nounwind strictfp {
 ; CHECKIZFHMIN-NEXT:    frflags a0
 ; CHECKIZFHMIN-NEXT:    flt.h a1, fa0, fa1
 ; CHECKIZFHMIN-NEXT:    fsflags a0
-; CHECKIZFHMIN-NEXT:    xori a0, a1, 1
 ; CHECKIZFHMIN-NEXT:    feq.h zero, fa0, fa1
+; CHECKIZFHMIN-NEXT:    xori a0, a1, 1
 ; CHECKIZFHMIN-NEXT:    ret
 ;
 ; CHECKIZHINXMIN-LABEL: fcmp_uge:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a0
-; CHECKIZHINXMIN-NEXT:    frflags a0
-; CHECKIZHINXMIN-NEXT:    flt.s a3, a2, a1
-; CHECKIZHINXMIN-NEXT:    fsflags a0
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    frflags a2
+; CHECKIZHINXMIN-NEXT:    flt.s a3, a0, a1
+; CHECKIZHINXMIN-NEXT:    fsflags a2
+; CHECKIZHINXMIN-NEXT:    feq.s zero, a0, a1
 ; CHECKIZHINXMIN-NEXT:    xori a0, a3, 1
-; CHECKIZHINXMIN-NEXT:    feq.s zero, a2, a1
 ; CHECKIZHINXMIN-NEXT:    ret
   %1 = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"uge", metadata !"fpexcept.strict") strictfp
   %2 = zext i1 %1 to i32
@@ -467,8 +463,8 @@ define i32 @fcmp_ult(half %a, half %b) nounwind strictfp {
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    fle.h a1, fa1, fa0
 ; CHECK-NEXT:    fsflags a0
-; CHECK-NEXT:    xori a0, a1, 1
 ; CHECK-NEXT:    feq.h zero, fa1, fa0
+; CHECK-NEXT:    xori a0, a1, 1
 ; CHECK-NEXT:    ret
 ;
 ; CHECKIZHINX-LABEL: fcmp_ult:
@@ -476,9 +472,8 @@ define i32 @fcmp_ult(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINX-NEXT:    frflags a2
 ; CHECKIZHINX-NEXT:    fle.h a3, a1, a0
 ; CHECKIZHINX-NEXT:    fsflags a2
-; CHECKIZHINX-NEXT:    xori a2, a3, 1
 ; CHECKIZHINX-NEXT:    feq.h zero, a1, a0
-; CHECKIZHINX-NEXT:    mv a0, a2
+; CHECKIZHINX-NEXT:    xori a0, a3, 1
 ; CHECKIZHINX-NEXT:    ret
 ;
 ; CHECKIZFHMIN-LABEL: fcmp_ult:
@@ -486,19 +481,19 @@ define i32 @fcmp_ult(half %a, half %b) nounwind strictfp {
 ; CHECKIZFHMIN-NEXT:    frflags a0
 ; CHECKIZFHMIN-NEXT:    fle.h a1, fa1, fa0
 ; CHECKIZFHMIN-NEXT:    fsflags a0
-; CHECKIZFHMIN-NEXT:    xori a0, a1, 1
 ; CHECKIZFHMIN-NEXT:    feq.h zero, fa1, fa0
+; CHECKIZFHMIN-NEXT:    xori a0, a1, 1
 ; CHECKIZFHMIN-NEXT:    ret
 ;
 ; CHECKIZHINXMIN-LABEL: fcmp_ult:
 ; CHECKIZHINXMIN:       # %bb.0:
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a0
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    frflags a0
-; CHECKIZHINXMIN-NEXT:    fle.s a3, a1, a2
-; CHECKIZHINXMIN-NEXT:    fsflags a0
+; CHECKIZHINXMIN-NEXT:    frflags a2
+; CHECKIZHINXMIN-NEXT:    fle.s a3, a1, a0
+; CHECKIZHINXMIN-NEXT:    fsflags a2
+; CHECKIZHINXMIN-NEXT:    feq.s zero, a1, a0
 ; CHECKIZHINXMIN-NEXT:    xori a0, a3, 1
-; CHECKIZHINXMIN-NEXT:    feq.s zero, a1, a2
 ; CHECKIZHINXMIN-NEXT:    ret
   %1 = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ult", metadata !"fpexcept.strict") strictfp
   %2 = zext i1 %1 to i32
@@ -511,8 +506,8 @@ define i32 @fcmp_ule(half %a, half %b) nounwind strictfp {
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    flt.h a1, fa1, fa0
 ; CHECK-NEXT:    fsflags a0
-; CHECK-NEXT:    xori a0, a1, 1
 ; CHECK-NEXT:    feq.h zero, fa1, fa0
+; CHECK-NEXT:    xori a0, a1, 1
 ; CHECK-NEXT:    ret
 ;
 ; CHECKIZHINX-LABEL: fcmp_ule:
@@ -520,9 +515,8 @@ define i32 @fcmp_ule(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINX-NEXT:    frflags a2
 ; CHECKIZHINX-NEXT:    flt.h a3, a1, a0
 ; CHECKIZHINX-NEXT:    fsflags a2
-; CHECKIZHINX-NEXT:    xori a2, a3, 1
 ; CHECKIZHINX-NEXT:    feq.h zero, a1, a0
-; CHECKIZHINX-NEXT:    mv a0, a2
+; CHECKIZHINX-NEXT:    xori a0, a3, 1
 ; CHECKIZHINX-NEXT:    ret
 ;
 ; CHECKIZFHMIN-LABEL: fcmp_ule:
@@ -530,19 +524,19 @@ define i32 @fcmp_ule(half %a, half %b) nounwind strictfp {
 ; CHECKIZFHMIN-NEXT:    frflags a0
 ; CHECKIZFHMIN-NEXT:    flt.h a1, fa1, fa0
 ; CHECKIZFHMIN-NEXT:    fsflags a0
-; CHECKIZFHMIN-NEXT:    xori a0, a1, 1
 ; CHECKIZFHMIN-NEXT:    feq.h zero, fa1, fa0
+; CHECKIZFHMIN-NEXT:    xori a0, a1, 1
 ; CHECKIZFHMIN-NEXT:    ret
 ;
 ; CHECKIZHINXMIN-LABEL: fcmp_ule:
 ; CHECKIZHINXMIN:       # %bb.0:
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a0
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    frflags a0
-; CHECKIZHINXMIN-NEXT:    flt.s a3, a1, a2
-; CHECKIZHINXMIN-NEXT:    fsflags a0
+; CHECKIZHINXMIN-NEXT:    frflags a2
+; CHECKIZHINXMIN-NEXT:    flt.s a3, a1, a0
+; CHECKIZHINXMIN-NEXT:    fsflags a2
+; CHECKIZHINXMIN-NEXT:    feq.s zero, a1, a0
 ; CHECKIZHINXMIN-NEXT:    xori a0, a3, 1
-; CHECKIZHINXMIN-NEXT:    feq.s zero, a1, a2
 ; CHECKIZHINXMIN-NEXT:    ret
   %1 = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ule", metadata !"fpexcept.strict") strictfp
   %2 = zext i1 %1 to i32
diff --git a/llvm/test/CodeGen/RISCV/half-intrinsics.ll b/llvm/test/CodeGen/RISCV/half-intrinsics.ll
index 7fcad77c7c17b..5d5f58278235c 100644
--- a/llvm/test/CodeGen/RISCV/half-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/half-intrinsics.ll
@@ -3439,8 +3439,8 @@ define {half, i32} @frexp_half(half %x) nounwind {
 ; RV32IZFH-NEXT:    fcvt.s.h fa0, fa0
 ; RV32IZFH-NEXT:    addi a0, sp, 8
 ; RV32IZFH-NEXT:    call frexpf
-; RV32IZFH-NEXT:    lw a0, 8(sp)
 ; RV32IZFH-NEXT:    fcvt.h.s fa0, fa0
+; RV32IZFH-NEXT:    lw a0, 8(sp)
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    addi sp, sp, 16
 ; RV32IZFH-NEXT:    ret
@@ -3452,8 +3452,8 @@ define {half, i32} @frexp_half(half %x) nounwind {
 ; RV64IZFH-NEXT:    fcvt.s.h fa0, fa0
 ; RV64IZFH-NEXT:    mv a0, sp
 ; RV64IZFH-NEXT:    call frexpf
-; RV64IZFH-NEXT:    ld a0, 0(sp)
 ; RV64IZFH-NEXT:    fcvt.h.s fa0, fa0
+; RV64IZFH-NEXT:    ld a0, 0(sp)
 ; RV64IZFH-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64IZFH-NEXT:    addi sp, sp, 16
 ; RV64IZFH-NEXT:    ret
@@ -3465,8 +3465,8 @@ define {half, i32} @frexp_half(half %x) nounwind {
 ; RV32IZHINX-NEXT:    fcvt.s.h a0, a0
 ; RV32IZHINX-NEXT:    addi a1, sp, 8
 ; RV32IZHINX-NEXT:    call frexpf
-; RV32IZHINX-NEXT:    lw a1, 8(sp)
 ; RV32IZHINX-NEXT:    fcvt.h.s a0, a0
+; RV32IZHINX-NEXT:    lw a1, 8(sp)
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    addi sp, sp, 16
 ; RV32IZHINX-NEXT:    ret
@@ -3478,8 +3478,8 @@ define {half, i32} @frexp_half(half %x) nounwind {
 ; RV64IZHINX-NEXT:    fcvt.s.h a0, a0
 ; RV64IZHINX-NEXT:    mv a1, sp
 ; RV64IZHINX-NEXT:    call frexpf
-; RV64IZHINX-NEXT:    ld a1, 0(sp)
 ; RV64IZHINX-NEXT:    fcvt.h.s a0, a0
+; RV64IZHINX-NEXT:    ld a1, 0(sp)
 ; RV64IZHINX-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64IZHINX-NEXT:    addi sp, sp, 16
 ; RV64IZHINX-NEXT:    ret
@@ -3521,8 +3521,8 @@ define {half, i32} @frexp_half(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fa0, fa0
 ; RV32IZFHMIN-NEXT:    addi a0, sp, 8
 ; RV32IZFHMIN-NEXT:    call frexpf
-; RV32IZFHMIN-NEXT:    lw a0, 8(sp)
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa0, fa0
+; RV32IZFHMIN-NEXT:    lw a0, 8(sp)
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    addi sp, sp, 16
 ; RV32IZFHMIN-NEXT:    ret
@@ -3534,8 +3534,8 @@ define {half, i32} @frexp_half(half %x) nounwind {
 ; RV64IZFHMIN-NEXT:    fcvt.s.h fa0, fa0
 ; RV64IZFHMIN-NEXT:    mv a0, sp
 ; RV64IZFHMIN-NEXT:    call frexpf
-; RV64IZFHMIN-NEXT:    ld a0, 0(sp)
 ; RV64IZFHMIN-NEXT:    fcvt.h.s fa0, fa0
+; RV64IZFHMIN-NEXT:    ld a0, 0(sp)
 ; RV64IZFHMIN-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64IZFHMIN-NEXT:    addi sp, sp, 16
 ; RV64IZFHMIN-NEXT:    ret
@@ -3547,8 +3547,8 @@ define {half, i32} @frexp_half(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; RV32IZHINXMIN-NEXT:    addi a1, sp, 8
 ; RV32IZHINXMIN-NEXT:    call frexpf
-; RV32IZHINXMIN-NEXT:    lw a1, 8(sp)
 ; RV32IZHINXMIN-NEXT:    fcvt.h.s a0, a0
+; RV32IZHINXMIN-NEXT:    lw a1, 8(sp)
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    addi sp, sp, 16
 ; RV32IZHINXMIN-NEXT:    ret
@@ -3560,8 +3560,8 @@ define {half, i32} @frexp_half(half %x) nounwind {
 ; RV64IZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; RV64IZHINXMIN-NEXT:    mv a1, sp
 ; RV64IZHINXMIN-NEXT:    call frexpf
-; RV64IZHINXMIN-NEXT:    ld a1, 0(sp)
 ; RV64IZHINXMIN-NEXT:    fcvt.h.s a0, a0
+; RV64IZHINXMIN-NEXT:    ld a1, 0(sp)
 ; RV64IZHINXMIN-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64IZHINXMIN-NEXT:    addi sp, sp, 16
 ; RV64IZHINXMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/half-mem.ll b/llvm/test/CodeGen/RISCV/half-mem.ll
index 9ac2a4d037f8a..a910bb9eec875 100644
--- a/llvm/test/CodeGen/RISCV/half-mem.ll
+++ b/llvm/test/CodeGen/RISCV/half-mem.ll
@@ -33,21 +33,21 @@ define half @flh(ptr %a) nounwind {
 ;
 ; CHECKIZFHMIN-LABEL: flh:
 ; CHECKIZFHMIN:       # %bb.0:
-; CHECKIZFHMIN-NEXT:    flh fa5, 6(a0)
-; CHECKIZFHMIN-NEXT:    flh fa4, 0(a0)
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
+; CHECKIZFHMIN-NEXT:    flh fa5, 0(a0)
+; CHECKIZFHMIN-NEXT:    flh fa4, 6(a0)
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
-; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa4, fa5
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
+; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa0, fa5
 ; CHECKIZFHMIN-NEXT:    ret
 ;
 ; CHECKIZHINXMIN-LABEL: flh:
 ; CHECKIZHINXMIN:       # %bb.0:
-; CHECKIZHINXMIN-NEXT:    lh a1, 6(a0)
-; CHECKIZHINXMIN-NEXT:    lh a0, 0(a0)
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
+; CHECKIZHINXMIN-NEXT:    lh a1, 0(a0)
+; CHECKIZHINXMIN-NEXT:    lh a0, 6(a0)
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, a1
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
+; CHECKIZHINXMIN-NEXT:    fadd.s a0, a1, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    ret
   %1 = load half, ptr %a
diff --git a/llvm/test/CodeGen/RISCV/half-select-fcmp.ll b/llvm/test/CodeGen/RISCV/half-select-fcmp.ll
index d92dcb9eac4c6..9aff2d434689f 100644
--- a/llvm/test/CodeGen/RISCV/half-select-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/half-select-fcmp.ll
@@ -737,12 +737,12 @@ define i32 @i32_select_fcmp_oeq(half %a, half %b, i32 %c, i32 %d) nounwind {
 ;
 ; CHECKIZHINX-LABEL: i32_select_fcmp_oeq:
 ; CHECKIZHINX:       # %bb.0:
-; CHECKIZHINX-NEXT:    feq.h a1, a0, a1
-; CHECKIZHINX-NEXT:    mv a0, a2
-; CHECKIZHINX-NEXT:    bnez a1, .LBB16_2
+; CHECKIZHINX-NEXT:    feq.h a0, a0, a1
+; CHECKIZHINX-NEXT:    bnez a0, .LBB16_2
 ; CHECKIZHINX-NEXT:  # %bb.1:
-; CHECKIZHINX-NEXT:    mv a0, a3
+; CHECKIZHINX-NEXT:    mv a2, a3
 ; CHECKIZHINX-NEXT:  .LBB16_2:
+; CHECKIZHINX-NEXT:    mv a0, a2
 ; CHECKIZHINX-NEXT:    ret
 ;
 ; CHECKIZFHMIN-LABEL: i32_select_fcmp_oeq:
@@ -760,12 +760,12 @@ define i32 @i32_select_fcmp_oeq(half %a, half %b, i32 %c, i32 %d) nounwind {
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECKIZHINXMIN-NEXT:    feq.s a1, a0, a1
-; CHECKIZHINXMIN-NEXT:    mv a0, a2
-; CHECKIZHINXMIN-NEXT:    bnez a1, .LBB16_2
+; CHECKIZHINXMIN-NEXT:    feq.s a0, a0, a1
+; CHECKIZHINXMIN-NEXT:    bnez a0, .LBB16_2
 ; CHECKIZHINXMIN-NEXT:  # %bb.1:
-; CHECKIZHINXMIN-NEXT:    mv a0, a3
+; CHECKIZHINXMIN-NEXT:    mv a2, a3
 ; CHECKIZHINXMIN-NEXT:  .LBB16_2:
+; CHECKIZHINXMIN-NEXT:    mv a0, a2
 ; CHECKIZHINXMIN-NEXT:    ret
   %1 = fcmp oeq half %a, %b
   %2 = select i1 %1, i32 %c, i32 %d
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index 66cde323ce507..00fac434517c4 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -301,58 +301,58 @@ define i64 @select_abs64(i64 %x) {
 define i128 @abs128(i128 %x) {
 ; RV32I-LABEL: abs128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a3, 12(a1)
 ; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a4, 4(a1)
-; RV32I-NEXT:    lw a1, 8(a1)
-; RV32I-NEXT:    bgez a3, .LBB8_2
+; RV32I-NEXT:    lw a3, 4(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    bgez a1, .LBB8_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    neg a5, a1
-; RV32I-NEXT:    snez a6, a4
+; RV32I-NEXT:    neg a5, a4
+; RV32I-NEXT:    snez a6, a3
 ; RV32I-NEXT:    snez a7, a2
-; RV32I-NEXT:    snez a1, a1
-; RV32I-NEXT:    neg a4, a4
+; RV32I-NEXT:    snez a4, a4
+; RV32I-NEXT:    neg a3, a3
 ; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    sub a4, a4, a7
-; RV32I-NEXT:    sltu a3, a5, a6
-; RV32I-NEXT:    neg a7, a1
-; RV32I-NEXT:    sub a1, a5, a6
-; RV32I-NEXT:    sub a3, a7, a3
+; RV32I-NEXT:    add a1, a1, a4
+; RV32I-NEXT:    sub a3, a3, a7
+; RV32I-NEXT:    sltu a7, a5, a6
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a4, a5, a6
+; RV32I-NEXT:    sub a1, a1, a7
 ; RV32I-NEXT:    neg a2, a2
 ; RV32I-NEXT:  .LBB8_2:
 ; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a4, 4(a0)
-; RV32I-NEXT:    sw a1, 8(a0)
-; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a3, 4(a0)
+; RV32I-NEXT:    sw a4, 8(a0)
+; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: abs128:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a3, 12(a1)
 ; RV32ZBB-NEXT:    lw a2, 0(a1)
-; RV32ZBB-NEXT:    lw a4, 4(a1)
-; RV32ZBB-NEXT:    lw a1, 8(a1)
-; RV32ZBB-NEXT:    bgez a3, .LBB8_2
+; RV32ZBB-NEXT:    lw a3, 4(a1)
+; RV32ZBB-NEXT:    lw a4, 8(a1)
+; RV32ZBB-NEXT:    lw a1, 12(a1)
+; RV32ZBB-NEXT:    bgez a1, .LBB8_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    neg a5, a1
-; RV32ZBB-NEXT:    snez a6, a4
+; RV32ZBB-NEXT:    neg a5, a4
+; RV32ZBB-NEXT:    snez a6, a3
 ; RV32ZBB-NEXT:    snez a7, a2
-; RV32ZBB-NEXT:    snez a1, a1
-; RV32ZBB-NEXT:    neg a4, a4
+; RV32ZBB-NEXT:    snez a4, a4
+; RV32ZBB-NEXT:    neg a3, a3
 ; RV32ZBB-NEXT:    or a6, a7, a6
-; RV32ZBB-NEXT:    add a1, a3, a1
-; RV32ZBB-NEXT:    sub a4, a4, a7
-; RV32ZBB-NEXT:    sltu a3, a5, a6
-; RV32ZBB-NEXT:    neg a7, a1
-; RV32ZBB-NEXT:    sub a1, a5, a6
-; RV32ZBB-NEXT:    sub a3, a7, a3
+; RV32ZBB-NEXT:    add a1, a1, a4
+; RV32ZBB-NEXT:    sub a3, a3, a7
+; RV32ZBB-NEXT:    sltu a7, a5, a6
+; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    sub a4, a5, a6
+; RV32ZBB-NEXT:    sub a1, a1, a7
 ; RV32ZBB-NEXT:    neg a2, a2
 ; RV32ZBB-NEXT:  .LBB8_2:
 ; RV32ZBB-NEXT:    sw a2, 0(a0)
-; RV32ZBB-NEXT:    sw a4, 4(a0)
-; RV32ZBB-NEXT:    sw a1, 8(a0)
-; RV32ZBB-NEXT:    sw a3, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 4(a0)
+; RV32ZBB-NEXT:    sw a4, 8(a0)
+; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64I-LABEL: abs128:
@@ -383,58 +383,58 @@ define i128 @abs128(i128 %x) {
 define i128 @select_abs128(i128 %x) {
 ; RV32I-LABEL: select_abs128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a3, 12(a1)
 ; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a4, 4(a1)
-; RV32I-NEXT:    lw a1, 8(a1)
-; RV32I-NEXT:    bgez a3, .LBB9_2
+; RV32I-NEXT:    lw a3, 4(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    bgez a1, .LBB9_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    neg a5, a1
-; RV32I-NEXT:    snez a6, a4
+; RV32I-NEXT:    neg a5, a4
+; RV32I-NEXT:    snez a6, a3
 ; RV32I-NEXT:    snez a7, a2
-; RV32I-NEXT:    snez a1, a1
-; RV32I-NEXT:    neg a4, a4
+; RV32I-NEXT:    snez a4, a4
+; RV32I-NEXT:    neg a3, a3
 ; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    sub a4, a4, a7
-; RV32I-NEXT:    sltu a3, a5, a6
-; RV32I-NEXT:    neg a7, a1
-; RV32I-NEXT:    sub a1, a5, a6
-; RV32I-NEXT:    sub a3, a7, a3
+; RV32I-NEXT:    add a1, a1, a4
+; RV32I-NEXT:    sub a3, a3, a7
+; RV32I-NEXT:    sltu a7, a5, a6
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a4, a5, a6
+; RV32I-NEXT:    sub a1, a1, a7
 ; RV32I-NEXT:    neg a2, a2
 ; RV32I-NEXT:  .LBB9_2:
 ; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a4, 4(a0)
-; RV32I-NEXT:    sw a1, 8(a0)
-; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a3, 4(a0)
+; RV32I-NEXT:    sw a4, 8(a0)
+; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: select_abs128:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a3, 12(a1)
 ; RV32ZBB-NEXT:    lw a2, 0(a1)
-; RV32ZBB-NEXT:    lw a4, 4(a1)
-; RV32ZBB-NEXT:    lw a1, 8(a1)
-; RV32ZBB-NEXT:    bgez a3, .LBB9_2
+; RV32ZBB-NEXT:    lw a3, 4(a1)
+; RV32ZBB-NEXT:    lw a4, 8(a1)
+; RV32ZBB-NEXT:    lw a1, 12(a1)
+; RV32ZBB-NEXT:    bgez a1, .LBB9_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    neg a5, a1
-; RV32ZBB-NEXT:    snez a6, a4
+; RV32ZBB-NEXT:    neg a5, a4
+; RV32ZBB-NEXT:    snez a6, a3
 ; RV32ZBB-NEXT:    snez a7, a2
-; RV32ZBB-NEXT:    snez a1, a1
-; RV32ZBB-NEXT:    neg a4, a4
+; RV32ZBB-NEXT:    snez a4, a4
+; RV32ZBB-NEXT:    neg a3, a3
 ; RV32ZBB-NEXT:    or a6, a7, a6
-; RV32ZBB-NEXT:    add a1, a3, a1
-; RV32ZBB-NEXT:    sub a4, a4, a7
-; RV32ZBB-NEXT:    sltu a3, a5, a6
-; RV32ZBB-NEXT:    neg a7, a1
-; RV32ZBB-NEXT:    sub a1, a5, a6
-; RV32ZBB-NEXT:    sub a3, a7, a3
+; RV32ZBB-NEXT:    add a1, a1, a4
+; RV32ZBB-NEXT:    sub a3, a3, a7
+; RV32ZBB-NEXT:    sltu a7, a5, a6
+; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    sub a4, a5, a6
+; RV32ZBB-NEXT:    sub a1, a1, a7
 ; RV32ZBB-NEXT:    neg a2, a2
 ; RV32ZBB-NEXT:  .LBB9_2:
 ; RV32ZBB-NEXT:    sw a2, 0(a0)
-; RV32ZBB-NEXT:    sw a4, 4(a0)
-; RV32ZBB-NEXT:    sw a1, 8(a0)
-; RV32ZBB-NEXT:    sw a3, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 4(a0)
+; RV32ZBB-NEXT:    sw a4, 8(a0)
+; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64I-LABEL: select_abs128:
diff --git a/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll b/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll
index d58e6fe7675da..bbc4c3735de45 100644
--- a/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll
+++ b/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll
@@ -26,11 +26,11 @@ define double @constraint_f_double(double %a) nounwind {
 ;
 ; RV64F-LABEL: constraint_f_double:
 ; RV64F:       # %bb.0:
-; RV64F-NEXT:    lui a1, %hi(gd)
-; RV64F-NEXT:    fld fa5, %lo(gd)(a1)
-; RV64F-NEXT:    fmv.d.x fa4, a0
+; RV64F-NEXT:    fmv.d.x fa5, a0
+; RV64F-NEXT:    lui a0, %hi(gd)
+; RV64F-NEXT:    fld fa4, %lo(gd)(a0)
 ; RV64F-NEXT:    #APP
-; RV64F-NEXT:    fadd.d fa5, fa4, fa5
+; RV64F-NEXT:    fadd.d fa5, fa5, fa4
 ; RV64F-NEXT:    #NO_APP
 ; RV64F-NEXT:    fmv.x.d a0, fa5
 ; RV64F-NEXT:    ret
@@ -59,11 +59,11 @@ define double @constraint_cf_double(double %a) nounwind {
 ;
 ; RV64F-LABEL: constraint_cf_double:
 ; RV64F:       # %bb.0:
-; RV64F-NEXT:    lui a1, %hi(gd)
-; RV64F-NEXT:    fld fa5, %lo(gd)(a1)
-; RV64F-NEXT:    fmv.d.x fa4, a0
+; RV64F-NEXT:    fmv.d.x fa5, a0
+; RV64F-NEXT:    lui a0, %hi(gd)
+; RV64F-NEXT:    fld fa4, %lo(gd)(a0)
 ; RV64F-NEXT:    #APP
-; RV64F-NEXT:    fadd.d fa5, fa4, fa5
+; RV64F-NEXT:    fadd.d fa5, fa5, fa4
 ; RV64F-NEXT:    #NO_APP
 ; RV64F-NEXT:    fmv.x.d a0, fa5
 ; RV64F-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll b/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll
index 238a0fa0b6fd7..144ddb99e5c4c 100644
--- a/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll
+++ b/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll
@@ -29,11 +29,11 @@ define double @constraint_f_double(double %a) nounwind {
 ;
 ; RV64F-LABEL: constraint_f_double:
 ; RV64F:       # %bb.0:
-; RV64F-NEXT:    lui a1, %hi(gd)
-; RV64F-NEXT:    fld fa5, %lo(gd)(a1)
-; RV64F-NEXT:    fmv.d.x fa4, a0
+; RV64F-NEXT:    fmv.d.x fa5, a0
+; RV64F-NEXT:    lui a0, %hi(gd)
+; RV64F-NEXT:    fld fa4, %lo(gd)(a0)
 ; RV64F-NEXT:    #APP
-; RV64F-NEXT:    .insn 0x4, 0x02000053 | (15 << 7) | (14 << 15) | (15 << 20)
+; RV64F-NEXT:    .insn 0x4, 0x02000053 | (15 << 7) | (15 << 15) | (14 << 20)
 ; RV64F-NEXT:    #NO_APP
 ; RV64F-NEXT:    fmv.x.d a0, fa5
 ; RV64F-NEXT:    ret
@@ -62,11 +62,11 @@ define double @constraint_cf_double(double %a) nounwind {
 ;
 ; RV64F-LABEL: constraint_cf_double:
 ; RV64F:       # %bb.0:
-; RV64F-NEXT:    lui a1, %hi(gd)
-; RV64F-NEXT:    fld fa5, %lo(gd)(a1)
-; RV64F-NEXT:    fmv.d.x fa4, a0
+; RV64F-NEXT:    fmv.d.x fa5, a0
+; RV64F-NEXT:    lui a0, %hi(gd)
+; RV64F-NEXT:    fld fa4, %lo(gd)(a0)
 ; RV64F-NEXT:    #APP
-; RV64F-NEXT:    .insn 0x4, 0x02000053 | (15 << 7) | (14 << 15) | (15 << 20)
+; RV64F-NEXT:    .insn 0x4, 0x02000053 | (15 << 7) | (15 << 15) | (14 << 20)
 ; RV64F-NEXT:    #NO_APP
 ; RV64F-NEXT:    fmv.x.d a0, fa5
 ; RV64F-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/inline-asm-f-constraint-f.ll b/llvm/test/CodeGen/RISCV/inline-asm-f-constraint-f.ll
index f17f5ba15c605..8ed247d1398ad 100644
--- a/llvm/test/CodeGen/RISCV/inline-asm-f-constraint-f.ll
+++ b/llvm/test/CodeGen/RISCV/inline-asm-f-constraint-f.ll
@@ -13,22 +13,22 @@
 define float @constraint_f_float(float %a) nounwind {
 ; RV32F-LABEL: constraint_f_float:
 ; RV32F:       # %bb.0:
-; RV32F-NEXT:    lui a1, %hi(gf)
-; RV32F-NEXT:    flw fa5, %lo(gf)(a1)
-; RV32F-NEXT:    fmv.w.x fa4, a0
+; RV32F-NEXT:    fmv.w.x fa5, a0
+; RV32F-NEXT:    lui a0, %hi(gf)
+; RV32F-NEXT:    flw fa4, %lo(gf)(a0)
 ; RV32F-NEXT:    #APP
-; RV32F-NEXT:    fadd.s fa5, fa4, fa5
+; RV32F-NEXT:    fadd.s fa5, fa5, fa4
 ; RV32F-NEXT:    #NO_APP
 ; RV32F-NEXT:    fmv.x.w a0, fa5
 ; RV32F-NEXT:    ret
 ;
 ; RV64F-LABEL: constraint_f_float:
 ; RV64F:       # %bb.0:
-; RV64F-NEXT:    lui a1, %hi(gf)
-; RV64F-NEXT:    flw fa5, %lo(gf)(a1)
-; RV64F-NEXT:    fmv.w.x fa4, a0
+; RV64F-NEXT:    fmv.w.x fa5, a0
+; RV64F-NEXT:    lui a0, %hi(gf)
+; RV64F-NEXT:    flw fa4, %lo(gf)(a0)
 ; RV64F-NEXT:    #APP
-; RV64F-NEXT:    fadd.s fa5, fa4, fa5
+; RV64F-NEXT:    fadd.s fa5, fa5, fa4
 ; RV64F-NEXT:    #NO_APP
 ; RV64F-NEXT:    fmv.x.w a0, fa5
 ; RV64F-NEXT:    ret
@@ -40,22 +40,22 @@ define float @constraint_f_float(float %a) nounwind {
 define float @constraint_cf_float(float %a) nounwind {
 ; RV32F-LABEL: constraint_cf_float:
 ; RV32F:       # %bb.0:
-; RV32F-NEXT:    lui a1, %hi(gf)
-; RV32F-NEXT:    flw fa5, %lo(gf)(a1)
-; RV32F-NEXT:    fmv.w.x fa4, a0
+; RV32F-NEXT:    fmv.w.x fa5, a0
+; RV32F-NEXT:    lui a0, %hi(gf)
+; RV32F-NEXT:    flw fa4, %lo(gf)(a0)
 ; RV32F-NEXT:    #APP
-; RV32F-NEXT:    fadd.s fa5, fa4, fa5
+; RV32F-NEXT:    fadd.s fa5, fa5, fa4
 ; RV32F-NEXT:    #NO_APP
 ; RV32F-NEXT:    fmv.x.w a0, fa5
 ; RV32F-NEXT:    ret
 ;
 ; RV64F-LABEL: constraint_cf_float:
 ; RV64F:       # %bb.0:
-; RV64F-NEXT:    lui a1, %hi(gf)
-; RV64F-NEXT:    flw fa5, %lo(gf)(a1)
-; RV64F-NEXT:    fmv.w.x fa4, a0
+; RV64F-NEXT:    fmv.w.x fa5, a0
+; RV64F-NEXT:    lui a0, %hi(gf)
+; RV64F-NEXT:    flw fa4, %lo(gf)(a0)
 ; RV64F-NEXT:    #APP
-; RV64F-NEXT:    fadd.s fa5, fa4, fa5
+; RV64F-NEXT:    fadd.s fa5, fa5, fa4
 ; RV64F-NEXT:    #NO_APP
 ; RV64F-NEXT:    fmv.x.w a0, fa5
 ; RV64F-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/inline-asm-f-modifier-N.ll b/llvm/test/CodeGen/RISCV/inline-asm-f-modifier-N.ll
index a0de5c71a7df6..10ed6367a49c2 100644
--- a/llvm/test/CodeGen/RISCV/inline-asm-f-modifier-N.ll
+++ b/llvm/test/CodeGen/RISCV/inline-asm-f-modifier-N.ll
@@ -16,22 +16,22 @@
 define float @constraint_f_modifier_N_float(float %a) nounwind {
 ; RV32F-LABEL: constraint_f_modifier_N_float:
 ; RV32F:       # %bb.0:
-; RV32F-NEXT:    lui a1, %hi(gf)
-; RV32F-NEXT:    flw fa5, %lo(gf)(a1)
-; RV32F-NEXT:    fmv.w.x fa4, a0
+; RV32F-NEXT:    fmv.w.x fa5, a0
+; RV32F-NEXT:    lui a0, %hi(gf)
+; RV32F-NEXT:    flw fa4, %lo(gf)(a0)
 ; RV32F-NEXT:    #APP
-; RV32F-NEXT:    .insn 0x4, 0x53 | (15 << 7) | (14 << 15) | (15 << 20)
+; RV32F-NEXT:    .insn 0x4, 0x53 | (15 << 7) | (15 << 15) | (14 << 20)
 ; RV32F-NEXT:    #NO_APP
 ; RV32F-NEXT:    fmv.x.w a0, fa5
 ; RV32F-NEXT:    ret
 ;
 ; RV64F-LABEL: constraint_f_modifier_N_float:
 ; RV64F:       # %bb.0:
-; RV64F-NEXT:    lui a1, %hi(gf)
-; RV64F-NEXT:    flw fa5, %lo(gf)(a1)
-; RV64F-NEXT:    fmv.w.x fa4, a0
+; RV64F-NEXT:    fmv.w.x fa5, a0
+; RV64F-NEXT:    lui a0, %hi(gf)
+; RV64F-NEXT:    flw fa4, %lo(gf)(a0)
 ; RV64F-NEXT:    #APP
-; RV64F-NEXT:    .insn 0x4, 0x53 | (15 << 7) | (14 << 15) | (15 << 20)
+; RV64F-NEXT:    .insn 0x4, 0x53 | (15 << 7) | (15 << 15) | (14 << 20)
 ; RV64F-NEXT:    #NO_APP
 ; RV64F-NEXT:    fmv.x.w a0, fa5
 ; RV64F-NEXT:    ret
@@ -44,22 +44,22 @@ define float @constraint_f_modifier_N_float(float %a) nounwind {
 define float @constraint_cf_modifier_N_float(float %a) nounwind {
 ; RV32F-LABEL: constraint_cf_modifier_N_float:
 ; RV32F:       # %bb.0:
-; RV32F-NEXT:    lui a1, %hi(gf)
-; RV32F-NEXT:    flw fa5, %lo(gf)(a1)
-; RV32F-NEXT:    fmv.w.x fa4, a0
+; RV32F-NEXT:    fmv.w.x fa5, a0
+; RV32F-NEXT:    lui a0, %hi(gf)
+; RV32F-NEXT:    flw fa4, %lo(gf)(a0)
 ; RV32F-NEXT:    #APP
-; RV32F-NEXT:    .insn 0x4, 0x53 | (15 << 7) | (14 << 15) | (15 << 20)
+; RV32F-NEXT:    .insn 0x4, 0x53 | (15 << 7) | (15 << 15) | (14 << 20)
 ; RV32F-NEXT:    #NO_APP
 ; RV32F-NEXT:    fmv.x.w a0, fa5
 ; RV32F-NEXT:    ret
 ;
 ; RV64F-LABEL: constraint_cf_modifier_N_float:
 ; RV64F:       # %bb.0:
-; RV64F-NEXT:    lui a1, %hi(gf)
-; RV64F-NEXT:    flw fa5, %lo(gf)(a1)
-; RV64F-NEXT:    fmv.w.x fa4, a0
+; RV64F-NEXT:    fmv.w.x fa5, a0
+; RV64F-NEXT:    lui a0, %hi(gf)
+; RV64F-NEXT:    flw fa4, %lo(gf)(a0)
 ; RV64F-NEXT:    #APP
-; RV64F-NEXT:    .insn 0x4, 0x53 | (15 << 7) | (14 << 15) | (15 << 20)
+; RV64F-NEXT:    .insn 0x4, 0x53 | (15 << 7) | (15 << 15) | (14 << 20)
 ; RV64F-NEXT:    #NO_APP
 ; RV64F-NEXT:    fmv.x.w a0, fa5
 ; RV64F-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/inline-asm-zfinx-constraint-r.ll b/llvm/test/CodeGen/RISCV/inline-asm-zfinx-constraint-r.ll
index 1c0de6c3f1612..4c15eaf7954d4 100644
--- a/llvm/test/CodeGen/RISCV/inline-asm-zfinx-constraint-r.ll
+++ b/llvm/test/CodeGen/RISCV/inline-asm-zfinx-constraint-r.ll
@@ -57,9 +57,9 @@ define float @constraint_float_abi_name(float %a) nounwind {
 ; RV32FINX:       # %bb.0:
 ; RV32FINX-NEXT:    addi sp, sp, -16
 ; RV32FINX-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32FINX-NEXT:    # kill: def $x10_w killed $x10_w def $x10
 ; RV32FINX-NEXT:    lui a1, %hi(gf)
 ; RV32FINX-NEXT:    lw s0, %lo(gf)(a1)
-; RV32FINX-NEXT:    # kill: def $x10_w killed $x10_w def $x10
 ; RV32FINX-NEXT:    #APP
 ; RV32FINX-NEXT:    fadd.s t0, a0, s0
 ; RV32FINX-NEXT:    #NO_APP
@@ -72,9 +72,9 @@ define float @constraint_float_abi_name(float %a) nounwind {
 ; RV64FINX:       # %bb.0:
 ; RV64FINX-NEXT:    addi sp, sp, -16
 ; RV64FINX-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; RV64FINX-NEXT:    # kill: def $x10_w killed $x10_w def $x10
 ; RV64FINX-NEXT:    lui a1, %hi(gf)
 ; RV64FINX-NEXT:    lw s0, %lo(gf)(a1)
-; RV64FINX-NEXT:    # kill: def $x10_w killed $x10_w def $x10
 ; RV64FINX-NEXT:    #APP
 ; RV64FINX-NEXT:    fadd.s t0, a0, s0
 ; RV64FINX-NEXT:    #NO_APP
diff --git a/llvm/test/CodeGen/RISCV/inline-asm-zhinx-constraint-r.ll b/llvm/test/CodeGen/RISCV/inline-asm-zhinx-constraint-r.ll
index 086d2a1d6f3b2..4482d68eba122 100644
--- a/llvm/test/CodeGen/RISCV/inline-asm-zhinx-constraint-r.ll
+++ b/llvm/test/CodeGen/RISCV/inline-asm-zhinx-constraint-r.ll
@@ -97,9 +97,9 @@ define half @constraint_half_abi_name(half %a) nounwind {
 ; RV32ZHINX:       # %bb.0:
 ; RV32ZHINX-NEXT:    addi sp, sp, -16
 ; RV32ZHINX-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32ZHINX-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV32ZHINX-NEXT:    lui a1, %hi(gh)
 ; RV32ZHINX-NEXT:    lh s0, %lo(gh)(a1)
-; RV32ZHINX-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV32ZHINX-NEXT:    #APP
 ; RV32ZHINX-NEXT:    fadd.s t0, a0, s0
 ; RV32ZHINX-NEXT:    #NO_APP
@@ -112,9 +112,9 @@ define half @constraint_half_abi_name(half %a) nounwind {
 ; RV64ZHINX:       # %bb.0:
 ; RV64ZHINX-NEXT:    addi sp, sp, -16
 ; RV64ZHINX-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; RV64ZHINX-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV64ZHINX-NEXT:    lui a1, %hi(gh)
 ; RV64ZHINX-NEXT:    lh s0, %lo(gh)(a1)
-; RV64ZHINX-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV64ZHINX-NEXT:    #APP
 ; RV64ZHINX-NEXT:    fadd.s t0, a0, s0
 ; RV64ZHINX-NEXT:    #NO_APP
@@ -127,9 +127,9 @@ define half @constraint_half_abi_name(half %a) nounwind {
 ; RV32DINXZHINX:       # %bb.0:
 ; RV32DINXZHINX-NEXT:    addi sp, sp, -16
 ; RV32DINXZHINX-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32DINXZHINX-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV32DINXZHINX-NEXT:    lui a1, %hi(gh)
 ; RV32DINXZHINX-NEXT:    lh s0, %lo(gh)(a1)
-; RV32DINXZHINX-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV32DINXZHINX-NEXT:    #APP
 ; RV32DINXZHINX-NEXT:    fadd.s t0, a0, s0
 ; RV32DINXZHINX-NEXT:    #NO_APP
@@ -142,9 +142,9 @@ define half @constraint_half_abi_name(half %a) nounwind {
 ; RV64DINXZHINX:       # %bb.0:
 ; RV64DINXZHINX-NEXT:    addi sp, sp, -16
 ; RV64DINXZHINX-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; RV64DINXZHINX-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV64DINXZHINX-NEXT:    lui a1, %hi(gh)
 ; RV64DINXZHINX-NEXT:    lh s0, %lo(gh)(a1)
-; RV64DINXZHINX-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV64DINXZHINX-NEXT:    #APP
 ; RV64DINXZHINX-NEXT:    fadd.s t0, a0, s0
 ; RV64DINXZHINX-NEXT:    #NO_APP
diff --git a/llvm/test/CodeGen/RISCV/inline-asm.ll b/llvm/test/CodeGen/RISCV/inline-asm.ll
index 79266743a1d05..7382ab4d3d1c2 100644
--- a/llvm/test/CodeGen/RISCV/inline-asm.ll
+++ b/llvm/test/CodeGen/RISCV/inline-asm.ll
@@ -34,21 +34,21 @@ define i32 @constraint_r(i32 %a) nounwind {
 define i32 @constraint_r_zero(i32 %a) nounwind {
 ; RV32I-LABEL: constraint_r_zero:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lui a0, %hi(gi)
-; RV32I-NEXT:    lw a0, %lo(gi)(a0)
-; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    li a0, 0
+; RV32I-NEXT:    lui a1, %hi(gi)
+; RV32I-NEXT:    lw a1, %lo(gi)(a1)
 ; RV32I-NEXT:    #APP
-; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    #NO_APP
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: constraint_r_zero:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a0, %hi(gi)
-; RV64I-NEXT:    lw a0, %lo(gi)(a0)
-; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:    li a0, 0
+; RV64I-NEXT:    lui a1, %hi(gi)
+; RV64I-NEXT:    lw a1, %lo(gi)(a1)
 ; RV64I-NEXT:    #APP
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    #NO_APP
 ; RV64I-NEXT:    ret
   %1 = load i32, ptr @gi
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
index 111b3e2bf82ce..391448b28c20b 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -75,15 +75,15 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    addi a2, sp, 16
 ; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    vid.v v8
-; RV32-NEXT:    li a2, -1
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vl2r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    li a3, -1
+; RV32-NEXT:    addi a4, sp, 32
+; RV32-NEXT:    vl2r.v v16, (a4) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT:    vmsne.vi v0, v24, 0
+; RV32-NEXT:    vmsne.vi v0, v16, 0
+; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT:    vmadd.vx v8, a2, v16
+; RV32-NEXT:    vmadd.vx v8, a3, v16
 ; RV32-NEXT:    vmv.v.i v16, 0
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vmerge.vim v16, v16, -1, v0
diff --git a/llvm/test/CodeGen/RISCV/legalize-fneg.ll b/llvm/test/CodeGen/RISCV/legalize-fneg.ll
index f60b77b92c09e..38cce2121c91d 100644
--- a/llvm/test/CodeGen/RISCV/legalize-fneg.ll
+++ b/llvm/test/CodeGen/RISCV/legalize-fneg.ll
@@ -30,12 +30,12 @@ entry:
 define void @test2(ptr %a, ptr %b) nounwind {
 ; RV32-LABEL: test2:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lw a2, 4(a1)
-; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    lw a2, 0(a1)
+; RV32-NEXT:    lw a1, 4(a1)
 ; RV32-NEXT:    lui a3, 524288
-; RV32-NEXT:    xor a2, a2, a3
-; RV32-NEXT:    sw a1, 0(a0)
-; RV32-NEXT:    sw a2, 4(a0)
+; RV32-NEXT:    xor a1, a1, a3
+; RV32-NEXT:    sw a2, 0(a0)
+; RV32-NEXT:    sw a1, 4(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test2:
@@ -56,27 +56,27 @@ entry:
 define void @test3(ptr %a, ptr %b) nounwind {
 ; RV32-LABEL: test3:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lw a2, 12(a1)
-; RV32-NEXT:    lw a3, 0(a1)
-; RV32-NEXT:    lw a4, 4(a1)
-; RV32-NEXT:    lw a1, 8(a1)
+; RV32-NEXT:    lw a2, 0(a1)
+; RV32-NEXT:    lw a3, 4(a1)
+; RV32-NEXT:    lw a4, 8(a1)
+; RV32-NEXT:    lw a1, 12(a1)
 ; RV32-NEXT:    lui a5, 524288
-; RV32-NEXT:    xor a2, a2, a5
-; RV32-NEXT:    sw a3, 0(a0)
-; RV32-NEXT:    sw a4, 4(a0)
-; RV32-NEXT:    sw a1, 8(a0)
-; RV32-NEXT:    sw a2, 12(a0)
+; RV32-NEXT:    xor a1, a1, a5
+; RV32-NEXT:    sw a2, 0(a0)
+; RV32-NEXT:    sw a3, 4(a0)
+; RV32-NEXT:    sw a4, 8(a0)
+; RV32-NEXT:    sw a1, 12(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test3:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    ld a2, 8(a1)
-; RV64-NEXT:    ld a1, 0(a1)
+; RV64-NEXT:    ld a2, 0(a1)
+; RV64-NEXT:    ld a1, 8(a1)
 ; RV64-NEXT:    li a3, -1
 ; RV64-NEXT:    slli a3, a3, 63
-; RV64-NEXT:    xor a2, a2, a3
-; RV64-NEXT:    sd a1, 0(a0)
-; RV64-NEXT:    sd a2, 8(a0)
+; RV64-NEXT:    xor a1, a1, a3
+; RV64-NEXT:    sd a2, 0(a0)
+; RV64-NEXT:    sd a1, 8(a0)
 ; RV64-NEXT:    ret
 entry:
   %0 = load fp128, ptr %b
diff --git a/llvm/test/CodeGen/RISCV/llvm.exp10.ll b/llvm/test/CodeGen/RISCV/llvm.exp10.ll
index 7b199504837e8..51189ef60e852 100644
--- a/llvm/test/CodeGen/RISCV/llvm.exp10.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.exp10.ll
@@ -143,12 +143,12 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
 ; RV32IFD-NEXT:    .cfi_offset fs1, -32
 ; RV32IFD-NEXT:    .cfi_offset fs2, -40
 ; RV32IFD-NEXT:    mv s0, a0
-; RV32IFD-NEXT:    lhu a0, 8(a1)
-; RV32IFD-NEXT:    lhu a2, 0(a1)
-; RV32IFD-NEXT:    lhu a1, 4(a1)
-; RV32IFD-NEXT:    fmv.w.x fs0, a0
-; RV32IFD-NEXT:    fmv.w.x fs1, a2
-; RV32IFD-NEXT:    fmv.w.x fa0, a1
+; RV32IFD-NEXT:    lhu a0, 0(a1)
+; RV32IFD-NEXT:    lhu a2, 4(a1)
+; RV32IFD-NEXT:    lhu a1, 8(a1)
+; RV32IFD-NEXT:    fmv.w.x fs0, a1
+; RV32IFD-NEXT:    fmv.w.x fs1, a0
+; RV32IFD-NEXT:    fmv.w.x fa0, a2
 ; RV32IFD-NEXT:    call __extendhfsf2
 ; RV32IFD-NEXT:    call exp10f
 ; RV32IFD-NEXT:    call __truncsfhf2
@@ -200,11 +200,11 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
 ; RV64IFD-NEXT:    .cfi_offset s1, -24
 ; RV64IFD-NEXT:    .cfi_offset s2, -32
 ; RV64IFD-NEXT:    .cfi_offset fs0, -40
+; RV64IFD-NEXT:    mv s0, a0
 ; RV64IFD-NEXT:    lhu s1, 0(a1)
-; RV64IFD-NEXT:    lhu a2, 8(a1)
+; RV64IFD-NEXT:    lhu a0, 8(a1)
 ; RV64IFD-NEXT:    lhu s2, 16(a1)
-; RV64IFD-NEXT:    mv s0, a0
-; RV64IFD-NEXT:    fmv.w.x fa0, a2
+; RV64IFD-NEXT:    fmv.w.x fa0, a0
 ; RV64IFD-NEXT:    call __extendhfsf2
 ; RV64IFD-NEXT:    call exp10f
 ; RV64IFD-NEXT:    call __truncsfhf2
@@ -267,14 +267,14 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) {
 ; RV32IFD-NEXT:    .cfi_offset fs2, -48
 ; RV32IFD-NEXT:    .cfi_offset fs3, -56
 ; RV32IFD-NEXT:    mv s0, a0
-; RV32IFD-NEXT:    lhu a0, 12(a1)
-; RV32IFD-NEXT:    lhu a2, 0(a1)
-; RV32IFD-NEXT:    lhu a3, 4(a1)
-; RV32IFD-NEXT:    lhu a1, 8(a1)
-; RV32IFD-NEXT:    fmv.w.x fs0, a0
-; RV32IFD-NEXT:    fmv.w.x fs1, a2
-; RV32IFD-NEXT:    fmv.w.x fs2, a3
-; RV32IFD-NEXT:    fmv.w.x fa0, a1
+; RV32IFD-NEXT:    lhu a0, 0(a1)
+; RV32IFD-NEXT:    lhu a2, 4(a1)
+; RV32IFD-NEXT:    lhu a3, 8(a1)
+; RV32IFD-NEXT:    lhu a1, 12(a1)
+; RV32IFD-NEXT:    fmv.w.x fs0, a1
+; RV32IFD-NEXT:    fmv.w.x fs1, a0
+; RV32IFD-NEXT:    fmv.w.x fs2, a2
+; RV32IFD-NEXT:    fmv.w.x fa0, a3
 ; RV32IFD-NEXT:    call __extendhfsf2
 ; RV32IFD-NEXT:    call exp10f
 ; RV32IFD-NEXT:    call __truncsfhf2
@@ -343,12 +343,12 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) {
 ; RV64IFD-NEXT:    .cfi_offset fs0, -48
 ; RV64IFD-NEXT:    .cfi_offset fs1, -56
 ; RV64IFD-NEXT:    .cfi_offset fs2, -64
+; RV64IFD-NEXT:    mv s0, a0
 ; RV64IFD-NEXT:    lhu s1, 0(a1)
 ; RV64IFD-NEXT:    lhu s2, 8(a1)
-; RV64IFD-NEXT:    lhu a2, 16(a1)
+; RV64IFD-NEXT:    lhu a0, 16(a1)
 ; RV64IFD-NEXT:    lhu s3, 24(a1)
-; RV64IFD-NEXT:    mv s0, a0
-; RV64IFD-NEXT:    fmv.w.x fa0, a2
+; RV64IFD-NEXT:    fmv.w.x fa0, a0
 ; RV64IFD-NEXT:    call __extendhfsf2
 ; RV64IFD-NEXT:    call exp10f
 ; RV64IFD-NEXT:    call __truncsfhf2
diff --git a/llvm/test/CodeGen/RISCV/llvm.frexp.ll b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
index 4a77b4d32cdda..28f56e49b6693 100644
--- a/llvm/test/CodeGen/RISCV/llvm.frexp.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
@@ -730,38 +730,37 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV32I-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw s0, 4(a1)
-; RV32I-NEXT:    lw s1, 8(a1)
-; RV32I-NEXT:    lw s2, 12(a1)
-; RV32I-NEXT:    mv s3, a0
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lw a0, 0(a1)
+; RV32I-NEXT:    lw s1, 4(a1)
+; RV32I-NEXT:    lw s2, 8(a1)
+; RV32I-NEXT:    lw s3, 12(a1)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call frexpf
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    addi a1, sp, 12
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    call frexpf
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    addi a1, sp, 16
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call frexpf
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    addi a1, sp, 20
+; RV32I-NEXT:    addi a1, sp, 16
 ; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call frexpf
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    addi a1, sp, 20
+; RV32I-NEXT:    mv a0, s3
+; RV32I-NEXT:    call frexpf
 ; RV32I-NEXT:    lw a1, 8(sp)
 ; RV32I-NEXT:    lw a2, 12(sp)
 ; RV32I-NEXT:    lw a3, 16(sp)
 ; RV32I-NEXT:    lw a4, 20(sp)
-; RV32I-NEXT:    sw s4, 0(s3)
-; RV32I-NEXT:    sw s0, 4(s3)
-; RV32I-NEXT:    sw s1, 8(s3)
-; RV32I-NEXT:    sw a0, 12(s3)
-; RV32I-NEXT:    sw a1, 16(s3)
-; RV32I-NEXT:    sw a2, 20(s3)
-; RV32I-NEXT:    sw a3, 24(s3)
-; RV32I-NEXT:    sw a4, 28(s3)
+; RV32I-NEXT:    sw s4, 0(s0)
+; RV32I-NEXT:    sw s1, 4(s0)
+; RV32I-NEXT:    sw s2, 8(s0)
+; RV32I-NEXT:    sw a0, 12(s0)
+; RV32I-NEXT:    sw a1, 16(s0)
+; RV32I-NEXT:    sw a2, 20(s0)
+; RV32I-NEXT:    sw a3, 24(s0)
+; RV32I-NEXT:    sw a4, 28(s0)
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
@@ -780,38 +779,37 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV64I-NEXT:    sd s2, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a2, 0(a1)
-; RV64I-NEXT:    lw s0, 8(a1)
-; RV64I-NEXT:    lw s1, 16(a1)
-; RV64I-NEXT:    lw s2, 24(a1)
-; RV64I-NEXT:    mv s3, a0
+; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    lw a0, 0(a1)
+; RV64I-NEXT:    lw s1, 8(a1)
+; RV64I-NEXT:    lw s2, 16(a1)
+; RV64I-NEXT:    lw s3, 24(a1)
 ; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call frexpf
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    addi a1, sp, 4
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    call frexpf
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    addi a1, sp, 8
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call frexpf
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    addi a1, sp, 12
+; RV64I-NEXT:    addi a1, sp, 8
 ; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call frexpf
+; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    addi a1, sp, 12
+; RV64I-NEXT:    mv a0, s3
+; RV64I-NEXT:    call frexpf
 ; RV64I-NEXT:    lw a1, 0(sp)
 ; RV64I-NEXT:    lw a2, 4(sp)
 ; RV64I-NEXT:    lw a3, 8(sp)
 ; RV64I-NEXT:    lw a4, 12(sp)
-; RV64I-NEXT:    sw s4, 0(s3)
-; RV64I-NEXT:    sw s0, 4(s3)
-; RV64I-NEXT:    sw s1, 8(s3)
-; RV64I-NEXT:    sw a0, 12(s3)
-; RV64I-NEXT:    sw a1, 16(s3)
-; RV64I-NEXT:    sw a2, 20(s3)
-; RV64I-NEXT:    sw a3, 24(s3)
-; RV64I-NEXT:    sw a4, 28(s3)
+; RV64I-NEXT:    sw s4, 0(s0)
+; RV64I-NEXT:    sw s1, 4(s0)
+; RV64I-NEXT:    sw s2, 8(s0)
+; RV64I-NEXT:    sw a0, 12(s0)
+; RV64I-NEXT:    sw a1, 16(s0)
+; RV64I-NEXT:    sw a2, 20(s0)
+; RV64I-NEXT:    sw a3, 24(s0)
+; RV64I-NEXT:    sw a4, 28(s0)
 ; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
@@ -998,30 +996,29 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV32I-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw s0, 4(a1)
-; RV32I-NEXT:    lw s1, 8(a1)
-; RV32I-NEXT:    lw s2, 12(a1)
-; RV32I-NEXT:    mv s3, a0
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lw a0, 0(a1)
+; RV32I-NEXT:    lw s1, 4(a1)
+; RV32I-NEXT:    lw s2, 8(a1)
+; RV32I-NEXT:    lw s3, 12(a1)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call frexpf
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    addi a1, sp, 12
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    call frexpf
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    addi a1, sp, 16
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call frexpf
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    addi a1, sp, 20
+; RV32I-NEXT:    addi a1, sp, 16
 ; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call frexpf
-; RV32I-NEXT:    sw s4, 0(s3)
-; RV32I-NEXT:    sw s0, 4(s3)
-; RV32I-NEXT:    sw s1, 8(s3)
-; RV32I-NEXT:    sw a0, 12(s3)
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    addi a1, sp, 20
+; RV32I-NEXT:    mv a0, s3
+; RV32I-NEXT:    call frexpf
+; RV32I-NEXT:    sw s4, 0(s0)
+; RV32I-NEXT:    sw s1, 4(s0)
+; RV32I-NEXT:    sw s2, 8(s0)
+; RV32I-NEXT:    sw a0, 12(s0)
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
@@ -1040,30 +1037,29 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV64I-NEXT:    sd s2, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a2, 0(a1)
-; RV64I-NEXT:    lw s0, 8(a1)
-; RV64I-NEXT:    lw s1, 16(a1)
-; RV64I-NEXT:    lw s2, 24(a1)
-; RV64I-NEXT:    mv s3, a0
+; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    lw a0, 0(a1)
+; RV64I-NEXT:    lw s1, 8(a1)
+; RV64I-NEXT:    lw s2, 16(a1)
+; RV64I-NEXT:    lw s3, 24(a1)
 ; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call frexpf
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    addi a1, sp, 4
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    call frexpf
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    addi a1, sp, 8
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call frexpf
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    addi a1, sp, 12
+; RV64I-NEXT:    addi a1, sp, 8
 ; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call frexpf
-; RV64I-NEXT:    sw s4, 0(s3)
-; RV64I-NEXT:    sw s0, 4(s3)
-; RV64I-NEXT:    sw s1, 8(s3)
-; RV64I-NEXT:    sw a0, 12(s3)
+; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    addi a1, sp, 12
+; RV64I-NEXT:    mv a0, s3
+; RV64I-NEXT:    call frexpf
+; RV64I-NEXT:    sw s4, 0(s0)
+; RV64I-NEXT:    sw s1, 4(s0)
+; RV64I-NEXT:    sw s2, 8(s0)
+; RV64I-NEXT:    sw a0, 12(s0)
 ; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
@@ -1230,31 +1226,30 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV32I-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw s0, 4(a1)
-; RV32I-NEXT:    lw s1, 8(a1)
-; RV32I-NEXT:    lw s2, 12(a1)
-; RV32I-NEXT:    mv s3, a0
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lw a0, 0(a1)
+; RV32I-NEXT:    lw s1, 4(a1)
+; RV32I-NEXT:    lw s2, 8(a1)
+; RV32I-NEXT:    lw s3, 12(a1)
 ; RV32I-NEXT:    addi a1, sp, 12
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call frexpf
 ; RV32I-NEXT:    addi a1, sp, 16
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call frexpf
 ; RV32I-NEXT:    addi a1, sp, 20
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call frexpf
 ; RV32I-NEXT:    addi a1, sp, 24
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, s3
 ; RV32I-NEXT:    call frexpf
 ; RV32I-NEXT:    lw a0, 12(sp)
 ; RV32I-NEXT:    lw a1, 16(sp)
 ; RV32I-NEXT:    lw a2, 20(sp)
 ; RV32I-NEXT:    lw a3, 24(sp)
-; RV32I-NEXT:    sw a0, 0(s3)
-; RV32I-NEXT:    sw a1, 4(s3)
-; RV32I-NEXT:    sw a2, 8(s3)
-; RV32I-NEXT:    sw a3, 12(s3)
+; RV32I-NEXT:    sw a0, 0(s0)
+; RV32I-NEXT:    sw a1, 4(s0)
+; RV32I-NEXT:    sw a2, 8(s0)
+; RV32I-NEXT:    sw a3, 12(s0)
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
@@ -1271,31 +1266,30 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV64I-NEXT:    sd s1, 40(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a2, 0(a1)
-; RV64I-NEXT:    lw s0, 8(a1)
-; RV64I-NEXT:    lw s1, 16(a1)
-; RV64I-NEXT:    lw s2, 24(a1)
-; RV64I-NEXT:    mv s3, a0
+; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    lw a0, 0(a1)
+; RV64I-NEXT:    lw s1, 8(a1)
+; RV64I-NEXT:    lw s2, 16(a1)
+; RV64I-NEXT:    lw s3, 24(a1)
 ; RV64I-NEXT:    addi a1, sp, 8
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call frexpf
 ; RV64I-NEXT:    addi a1, sp, 12
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call frexpf
 ; RV64I-NEXT:    addi a1, sp, 16
-; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call frexpf
 ; RV64I-NEXT:    addi a1, sp, 20
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s3
 ; RV64I-NEXT:    call frexpf
 ; RV64I-NEXT:    lw a0, 8(sp)
 ; RV64I-NEXT:    lw a1, 12(sp)
 ; RV64I-NEXT:    lw a2, 16(sp)
 ; RV64I-NEXT:    lw a3, 20(sp)
-; RV64I-NEXT:    sw a0, 0(s3)
-; RV64I-NEXT:    sw a1, 4(s3)
-; RV64I-NEXT:    sw a2, 8(s3)
-; RV64I-NEXT:    sw a3, 12(s3)
+; RV64I-NEXT:    sw a0, 0(s0)
+; RV64I-NEXT:    sw a1, 4(s0)
+; RV64I-NEXT:    sw a2, 8(s0)
+; RV64I-NEXT:    sw a3, 12(s0)
 ; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
@@ -1547,18 +1541,18 @@ define { fp128, i32 } @test_frexp_f128_i32(fp128 %a) nounwind {
 ; RV32IFD-NEXT:    addi sp, sp, -48
 ; RV32IFD-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    lw a3, 0(a1)
-; RV32IFD-NEXT:    lw a4, 4(a1)
-; RV32IFD-NEXT:    lw a5, 8(a1)
-; RV32IFD-NEXT:    lw a6, 12(a1)
 ; RV32IFD-NEXT:    mv s0, a0
+; RV32IFD-NEXT:    lw a0, 0(a1)
+; RV32IFD-NEXT:    lw a2, 4(a1)
+; RV32IFD-NEXT:    lw a3, 8(a1)
+; RV32IFD-NEXT:    lw a1, 12(a1)
+; RV32IFD-NEXT:    sw a0, 0(sp)
+; RV32IFD-NEXT:    sw a2, 4(sp)
+; RV32IFD-NEXT:    sw a3, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
 ; RV32IFD-NEXT:    addi a0, sp, 16
 ; RV32IFD-NEXT:    mv a1, sp
 ; RV32IFD-NEXT:    addi a2, sp, 36
-; RV32IFD-NEXT:    sw a3, 0(sp)
-; RV32IFD-NEXT:    sw a4, 4(sp)
-; RV32IFD-NEXT:    sw a5, 8(sp)
-; RV32IFD-NEXT:    sw a6, 12(sp)
 ; RV32IFD-NEXT:    call frexpl
 ; RV32IFD-NEXT:    lw a0, 36(sp)
 ; RV32IFD-NEXT:    lw a1, 16(sp)
@@ -1600,18 +1594,18 @@ define { fp128, i32 } @test_frexp_f128_i32(fp128 %a) nounwind {
 ; RV32IZFINXZDINX-NEXT:    addi sp, sp, -48
 ; RV32IZFINXZDINX-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
 ; RV32IZFINXZDINX-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    lw a3, 0(a1)
-; RV32IZFINXZDINX-NEXT:    lw a4, 4(a1)
-; RV32IZFINXZDINX-NEXT:    lw a5, 8(a1)
-; RV32IZFINXZDINX-NEXT:    lw a6, 12(a1)
 ; RV32IZFINXZDINX-NEXT:    mv s0, a0
+; RV32IZFINXZDINX-NEXT:    lw a0, 0(a1)
+; RV32IZFINXZDINX-NEXT:    lw a2, 4(a1)
+; RV32IZFINXZDINX-NEXT:    lw a3, 8(a1)
+; RV32IZFINXZDINX-NEXT:    lw a1, 12(a1)
+; RV32IZFINXZDINX-NEXT:    sw a0, 0(sp)
+; RV32IZFINXZDINX-NEXT:    sw a2, 4(sp)
+; RV32IZFINXZDINX-NEXT:    sw a3, 8(sp)
+; RV32IZFINXZDINX-NEXT:    sw a1, 12(sp)
 ; RV32IZFINXZDINX-NEXT:    addi a0, sp, 16
 ; RV32IZFINXZDINX-NEXT:    mv a1, sp
 ; RV32IZFINXZDINX-NEXT:    addi a2, sp, 36
-; RV32IZFINXZDINX-NEXT:    sw a3, 0(sp)
-; RV32IZFINXZDINX-NEXT:    sw a4, 4(sp)
-; RV32IZFINXZDINX-NEXT:    sw a5, 8(sp)
-; RV32IZFINXZDINX-NEXT:    sw a6, 12(sp)
 ; RV32IZFINXZDINX-NEXT:    call frexpl
 ; RV32IZFINXZDINX-NEXT:    lw a0, 36(sp)
 ; RV32IZFINXZDINX-NEXT:    lw a1, 16(sp)
@@ -1653,18 +1647,18 @@ define { fp128, i32 } @test_frexp_f128_i32(fp128 %a) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -48
 ; RV32I-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw a4, 4(a1)
-; RV32I-NEXT:    lw a5, 8(a1)
-; RV32I-NEXT:    lw a6, 12(a1)
 ; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lw a0, 0(a1)
+; RV32I-NEXT:    lw a2, 4(a1)
+; RV32I-NEXT:    lw a3, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sw a0, 0(sp)
+; RV32I-NEXT:    sw a2, 4(sp)
+; RV32I-NEXT:    sw a3, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a0, sp, 16
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    addi a2, sp, 36
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
 ; RV32I-NEXT:    call frexpl
 ; RV32I-NEXT:    lw a0, 36(sp)
 ; RV32I-NEXT:    lw a1, 16(sp)
@@ -1710,18 +1704,18 @@ define fp128 @test_frexp_f128_i32_only_use_fract(fp128 %a) nounwind {
 ; RV32IFD-NEXT:    addi sp, sp, -48
 ; RV32IFD-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    lw a3, 0(a1)
-; RV32IFD-NEXT:    lw a4, 4(a1)
-; RV32IFD-NEXT:    lw a5, 8(a1)
-; RV32IFD-NEXT:    lw a6, 12(a1)
 ; RV32IFD-NEXT:    mv s0, a0
+; RV32IFD-NEXT:    lw a0, 0(a1)
+; RV32IFD-NEXT:    lw a2, 4(a1)
+; RV32IFD-NEXT:    lw a3, 8(a1)
+; RV32IFD-NEXT:    lw a1, 12(a1)
+; RV32IFD-NEXT:    sw a0, 0(sp)
+; RV32IFD-NEXT:    sw a2, 4(sp)
+; RV32IFD-NEXT:    sw a3, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
 ; RV32IFD-NEXT:    addi a0, sp, 16
 ; RV32IFD-NEXT:    mv a1, sp
 ; RV32IFD-NEXT:    addi a2, sp, 36
-; RV32IFD-NEXT:    sw a3, 0(sp)
-; RV32IFD-NEXT:    sw a4, 4(sp)
-; RV32IFD-NEXT:    sw a5, 8(sp)
-; RV32IFD-NEXT:    sw a6, 12(sp)
 ; RV32IFD-NEXT:    call frexpl
 ; RV32IFD-NEXT:    lw a0, 16(sp)
 ; RV32IFD-NEXT:    lw a1, 20(sp)
@@ -1751,18 +1745,18 @@ define fp128 @test_frexp_f128_i32_only_use_fract(fp128 %a) nounwind {
 ; RV32IZFINXZDINX-NEXT:    addi sp, sp, -48
 ; RV32IZFINXZDINX-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
 ; RV32IZFINXZDINX-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    lw a3, 0(a1)
-; RV32IZFINXZDINX-NEXT:    lw a4, 4(a1)
-; RV32IZFINXZDINX-NEXT:    lw a5, 8(a1)
-; RV32IZFINXZDINX-NEXT:    lw a6, 12(a1)
 ; RV32IZFINXZDINX-NEXT:    mv s0, a0
+; RV32IZFINXZDINX-NEXT:    lw a0, 0(a1)
+; RV32IZFINXZDINX-NEXT:    lw a2, 4(a1)
+; RV32IZFINXZDINX-NEXT:    lw a3, 8(a1)
+; RV32IZFINXZDINX-NEXT:    lw a1, 12(a1)
+; RV32IZFINXZDINX-NEXT:    sw a0, 0(sp)
+; RV32IZFINXZDINX-NEXT:    sw a2, 4(sp)
+; RV32IZFINXZDINX-NEXT:    sw a3, 8(sp)
+; RV32IZFINXZDINX-NEXT:    sw a1, 12(sp)
 ; RV32IZFINXZDINX-NEXT:    addi a0, sp, 16
 ; RV32IZFINXZDINX-NEXT:    mv a1, sp
 ; RV32IZFINXZDINX-NEXT:    addi a2, sp, 36
-; RV32IZFINXZDINX-NEXT:    sw a3, 0(sp)
-; RV32IZFINXZDINX-NEXT:    sw a4, 4(sp)
-; RV32IZFINXZDINX-NEXT:    sw a5, 8(sp)
-; RV32IZFINXZDINX-NEXT:    sw a6, 12(sp)
 ; RV32IZFINXZDINX-NEXT:    call frexpl
 ; RV32IZFINXZDINX-NEXT:    lw a0, 16(sp)
 ; RV32IZFINXZDINX-NEXT:    lw a1, 20(sp)
@@ -1792,18 +1786,18 @@ define fp128 @test_frexp_f128_i32_only_use_fract(fp128 %a) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -48
 ; RV32I-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw a4, 4(a1)
-; RV32I-NEXT:    lw a5, 8(a1)
-; RV32I-NEXT:    lw a6, 12(a1)
 ; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lw a0, 0(a1)
+; RV32I-NEXT:    lw a2, 4(a1)
+; RV32I-NEXT:    lw a3, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sw a0, 0(sp)
+; RV32I-NEXT:    sw a2, 4(sp)
+; RV32I-NEXT:    sw a3, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    addi a0, sp, 16
 ; RV32I-NEXT:    mv a1, sp
 ; RV32I-NEXT:    addi a2, sp, 36
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
 ; RV32I-NEXT:    call frexpl
 ; RV32I-NEXT:    lw a0, 16(sp)
 ; RV32I-NEXT:    lw a1, 20(sp)
@@ -1837,17 +1831,17 @@ define i32 @test_frexp_f128_i32_only_use_exp(fp128 %a) nounwind {
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -48
 ; RV32IFD-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    lw a3, 0(a0)
-; RV32IFD-NEXT:    lw a4, 4(a0)
-; RV32IFD-NEXT:    lw a5, 8(a0)
-; RV32IFD-NEXT:    lw a6, 12(a0)
+; RV32IFD-NEXT:    lw a1, 0(a0)
+; RV32IFD-NEXT:    lw a2, 4(a0)
+; RV32IFD-NEXT:    lw a3, 8(a0)
+; RV32IFD-NEXT:    lw a0, 12(a0)
+; RV32IFD-NEXT:    sw a1, 8(sp)
+; RV32IFD-NEXT:    sw a2, 12(sp)
+; RV32IFD-NEXT:    sw a3, 16(sp)
+; RV32IFD-NEXT:    sw a0, 20(sp)
 ; RV32IFD-NEXT:    addi a0, sp, 24
 ; RV32IFD-NEXT:    addi a1, sp, 8
 ; RV32IFD-NEXT:    addi a2, sp, 40
-; RV32IFD-NEXT:    sw a3, 8(sp)
-; RV32IFD-NEXT:    sw a4, 12(sp)
-; RV32IFD-NEXT:    sw a5, 16(sp)
-; RV32IFD-NEXT:    sw a6, 20(sp)
 ; RV32IFD-NEXT:    call frexpl
 ; RV32IFD-NEXT:    lw a0, 40(sp)
 ; RV32IFD-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
@@ -1869,17 +1863,17 @@ define i32 @test_frexp_f128_i32_only_use_exp(fp128 %a) nounwind {
 ; RV32IZFINXZDINX:       # %bb.0:
 ; RV32IZFINXZDINX-NEXT:    addi sp, sp, -48
 ; RV32IZFINXZDINX-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    lw a3, 0(a0)
-; RV32IZFINXZDINX-NEXT:    lw a4, 4(a0)
-; RV32IZFINXZDINX-NEXT:    lw a5, 8(a0)
-; RV32IZFINXZDINX-NEXT:    lw a6, 12(a0)
+; RV32IZFINXZDINX-NEXT:    lw a1, 0(a0)
+; RV32IZFINXZDINX-NEXT:    lw a2, 4(a0)
+; RV32IZFINXZDINX-NEXT:    lw a3, 8(a0)
+; RV32IZFINXZDINX-NEXT:    lw a0, 12(a0)
+; RV32IZFINXZDINX-NEXT:    sw a1, 8(sp)
+; RV32IZFINXZDINX-NEXT:    sw a2, 12(sp)
+; RV32IZFINXZDINX-NEXT:    sw a3, 16(sp)
+; RV32IZFINXZDINX-NEXT:    sw a0, 20(sp)
 ; RV32IZFINXZDINX-NEXT:    addi a0, sp, 24
 ; RV32IZFINXZDINX-NEXT:    addi a1, sp, 8
 ; RV32IZFINXZDINX-NEXT:    addi a2, sp, 40
-; RV32IZFINXZDINX-NEXT:    sw a3, 8(sp)
-; RV32IZFINXZDINX-NEXT:    sw a4, 12(sp)
-; RV32IZFINXZDINX-NEXT:    sw a5, 16(sp)
-; RV32IZFINXZDINX-NEXT:    sw a6, 20(sp)
 ; RV32IZFINXZDINX-NEXT:    call frexpl
 ; RV32IZFINXZDINX-NEXT:    lw a0, 40(sp)
 ; RV32IZFINXZDINX-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
@@ -1901,17 +1895,17 @@ define i32 @test_frexp_f128_i32_only_use_exp(fp128 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi sp, sp, -48
 ; RV32I-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    lw a5, 8(a0)
-; RV32I-NEXT:    lw a6, 12(a0)
+; RV32I-NEXT:    lw a1, 0(a0)
+; RV32I-NEXT:    lw a2, 4(a0)
+; RV32I-NEXT:    lw a3, 8(a0)
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    sw a1, 8(sp)
+; RV32I-NEXT:    sw a2, 12(sp)
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    sw a0, 20(sp)
 ; RV32I-NEXT:    addi a0, sp, 24
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    addi a2, sp, 40
-; RV32I-NEXT:    sw a3, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
-; RV32I-NEXT:    sw a5, 16(sp)
-; RV32I-NEXT:    sw a6, 20(sp)
 ; RV32I-NEXT:    call frexpl
 ; RV32I-NEXT:    lw a0, 40(sp)
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/loop-strength-reduce-add-cheaper-than-mul.ll b/llvm/test/CodeGen/RISCV/loop-strength-reduce-add-cheaper-than-mul.ll
index fa8ca071d2189..627f0005932a3 100644
--- a/llvm/test/CodeGen/RISCV/loop-strength-reduce-add-cheaper-than-mul.ll
+++ b/llvm/test/CodeGen/RISCV/loop-strength-reduce-add-cheaper-than-mul.ll
@@ -43,8 +43,8 @@ define void @test(i32 signext %i) nounwind {
 ; RV32-NEXT:  .LBB0_2: # %bb
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    add a4, a2, a1
-; RV32-NEXT:    add a1, a1, a0
 ; RV32-NEXT:    sb zero, 0(a4)
+; RV32-NEXT:    add a1, a1, a0
 ; RV32-NEXT:    blt a1, a3, .LBB0_2
 ; RV32-NEXT:  .LBB0_3: # %return
 ; RV32-NEXT:    ret
@@ -63,8 +63,8 @@ define void @test(i32 signext %i) nounwind {
 ; RV64-NEXT:    slli a4, a1, 32
 ; RV64-NEXT:    srli a4, a4, 32
 ; RV64-NEXT:    add a4, a2, a4
-; RV64-NEXT:    addw a1, a1, a0
 ; RV64-NEXT:    sb zero, 0(a4)
+; RV64-NEXT:    addw a1, a1, a0
 ; RV64-NEXT:    blt a1, a3, .LBB0_2
 ; RV64-NEXT:  .LBB0_3: # %return
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/machine-sink-load-immediate.ll b/llvm/test/CodeGen/RISCV/machine-sink-load-immediate.ll
index eb84774014a4b..b3777668e20bd 100644
--- a/llvm/test/CodeGen/RISCV/machine-sink-load-immediate.ll
+++ b/llvm/test/CodeGen/RISCV/machine-sink-load-immediate.ll
@@ -319,8 +319,8 @@ define signext i32 @branch_dispatch(i8 %a) {
 ; CHECK-NEXT:    li a1, 70
 ; CHECK-NEXT:    beq a0, a1, .LBB3_9
 ; CHECK-NEXT:  # %bb.3: # %case.3
-; CHECK-NEXT:    li a1, 234
 ; CHECK-NEXT:    li s0, 23
+; CHECK-NEXT:    li a1, 234
 ; CHECK-NEXT:    beq a0, a1, .LBB3_10
 ; CHECK-NEXT:  # %bb.4: # %case.4
 ; CHECK-NEXT:    beqz a0, .LBB3_11
diff --git a/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll
index 8deb17582cb11..ae9572328bd5d 100644
--- a/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll
+++ b/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll
@@ -57,29 +57,29 @@ ret:
 define void @test_la(i32 signext %n) {
 ; RV32I-LABEL: test_la:
 ; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:  .Lpcrel_hi1:
-; RV32I-NEXT:    auipc a1, %got_pcrel_hi(g)
-; RV32I-NEXT:    lw a1, %pcrel_lo(.Lpcrel_hi1)(a1)
-; RV32I-NEXT:    li a2, 0
+; RV32I-NEXT:    auipc a2, %got_pcrel_hi(g)
+; RV32I-NEXT:    lw a2, %pcrel_lo(.Lpcrel_hi1)(a2)
 ; RV32I-NEXT:  .LBB1_1: # %loop
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    lw zero, 0(a1)
-; RV32I-NEXT:    addi a2, a2, 1
-; RV32I-NEXT:    blt a2, a0, .LBB1_1
+; RV32I-NEXT:    lw zero, 0(a2)
+; RV32I-NEXT:    addi a1, a1, 1
+; RV32I-NEXT:    blt a1, a0, .LBB1_1
 ; RV32I-NEXT:  # %bb.2: # %ret
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_la:
 ; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    li a1, 0
 ; RV64I-NEXT:  .Lpcrel_hi1:
-; RV64I-NEXT:    auipc a1, %got_pcrel_hi(g)
-; RV64I-NEXT:    ld a1, %pcrel_lo(.Lpcrel_hi1)(a1)
-; RV64I-NEXT:    li a2, 0
+; RV64I-NEXT:    auipc a2, %got_pcrel_hi(g)
+; RV64I-NEXT:    ld a2, %pcrel_lo(.Lpcrel_hi1)(a2)
 ; RV64I-NEXT:  .LBB1_1: # %loop
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64I-NEXT:    lw zero, 0(a1)
-; RV64I-NEXT:    addiw a2, a2, 1
-; RV64I-NEXT:    blt a2, a0, .LBB1_1
+; RV64I-NEXT:    lw zero, 0(a2)
+; RV64I-NEXT:    addiw a1, a1, 1
+; RV64I-NEXT:    blt a1, a0, .LBB1_1
 ; RV64I-NEXT:  # %bb.2: # %ret
 ; RV64I-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll b/llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll
index d1b10af16063a..78b34452adef6 100644
--- a/llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll
+++ b/llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll
@@ -118,10 +118,9 @@ define void @test_regalloc_hint(i32 noundef signext %0, i32 noundef signext %1)
 ;
 ; FUSION-GENERIC-LABEL: test_regalloc_hint:
 ; FUSION-GENERIC:       # %bb.0:
-; FUSION-GENERIC-NEXT:    lui a2, 3014
-; FUSION-GENERIC-NEXT:    addiw a2, a2, 334
 ; FUSION-GENERIC-NEXT:    mv a0, a1
-; FUSION-GENERIC-NEXT:    mv a1, a2
+; FUSION-GENERIC-NEXT:    lui a1, 3014
+; FUSION-GENERIC-NEXT:    addiw a1, a1, 334
 ; FUSION-GENERIC-NEXT:    tail bar
   tail call void @bar(i32 noundef signext %1, i32 noundef signext 12345678)
   ret void
diff --git a/llvm/test/CodeGen/RISCV/mem.ll b/llvm/test/CodeGen/RISCV/mem.ll
index a9cb80cb66349..cbfb63785661a 100644
--- a/llvm/test/CodeGen/RISCV/mem.ll
+++ b/llvm/test/CodeGen/RISCV/mem.ll
@@ -123,8 +123,9 @@ define dso_local i32 @load_sext_zext_anyext_i1(ptr %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lbu a1, 1(a0)
 ; RV32I-NEXT:    lbu a2, 2(a0)
+; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    lbu zero, 0(a0)
-; RV32I-NEXT:    sub a0, a2, a1
+; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    ret
   ; sextload i1
   %1 = getelementptr i1, ptr %a, i32 1
@@ -145,8 +146,9 @@ define dso_local i16 @load_sext_zext_anyext_i1_i16(ptr %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lbu a1, 1(a0)
 ; RV32I-NEXT:    lbu a2, 2(a0)
+; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    lbu zero, 0(a0)
-; RV32I-NEXT:    sub a0, a2, a1
+; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    ret
   ; sextload i1
   %1 = getelementptr i1, ptr %a, i32 1
diff --git a/llvm/test/CodeGen/RISCV/mem64.ll b/llvm/test/CodeGen/RISCV/mem64.ll
index 248964146325a..254a1f85faa00 100644
--- a/llvm/test/CodeGen/RISCV/mem64.ll
+++ b/llvm/test/CodeGen/RISCV/mem64.ll
@@ -168,8 +168,9 @@ define dso_local i64 @load_sext_zext_anyext_i1(ptr %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lbu a1, 1(a0)
 ; RV64I-NEXT:    lbu a2, 2(a0)
+; RV64I-NEXT:    sub a1, a2, a1
 ; RV64I-NEXT:    lbu zero, 0(a0)
-; RV64I-NEXT:    sub a0, a2, a1
+; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ret
   ; sextload i1
   %1 = getelementptr i1, ptr %a, i32 1
@@ -190,8 +191,9 @@ define dso_local i16 @load_sext_zext_anyext_i1_i16(ptr %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lbu a1, 1(a0)
 ; RV64I-NEXT:    lbu a2, 2(a0)
+; RV64I-NEXT:    sub a1, a2, a1
 ; RV64I-NEXT:    lbu zero, 0(a0)
-; RV64I-NEXT:    sub a0, a2, a1
+; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ret
   ; sextload i1
   %1 = getelementptr i1, ptr %a, i32 1
diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
index f9086ba9d6354..6a63e80717623 100644
--- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
@@ -2449,14 +2449,14 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a2, 2(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a0, 0(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a3, 2(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a1, 0(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 16
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a0, a2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    slli a3, a3, 16
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a0, 2(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    slli a0, a0, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a2, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a1, 2(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    slli a1, a1, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a1, a2, a1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a2, a0, a1
@@ -2466,14 +2466,14 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a2, 2(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a0, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a3, 2(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a1, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 16
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 16
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a0, 2(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a0, a0, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a2, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a1, 2(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a1, a1, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a2, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
@@ -2487,10 +2487,10 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a0, 2(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a3, 0(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a1, 2(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    pack a0, a2, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a2, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a1, 2(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    pack a1, a2, a1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a2, a0, a1
@@ -2500,14 +2500,14 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a2, 2(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a0, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a3, 2(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a1, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    slli a2, a2, 16
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    slli a3, a3, 16
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a0, 2(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    slli a0, a0, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a2, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a1, 2(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    slli a1, a1, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a1, a2, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
@@ -2835,14 +2835,14 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_5:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a2, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a0, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a3, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a1, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a2, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a2, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
@@ -2872,10 +2872,10 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a1, 4(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a0, a2, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a2, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a1, a2, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
@@ -3034,14 +3034,14 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_6:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a2, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a0, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a3, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a1, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a2, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a2, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
@@ -3077,10 +3077,10 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a1, 4(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a0, a2, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a2, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a1, a2, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
@@ -4410,104 +4410,104 @@ entry:
 define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV32-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 3(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV32-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV32-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a5, 1(a0)
-; CHECK-ALIGNED-RV32-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 2(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV32-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 0(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 1(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 2(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-NEXT:    slli a3, a3, 8
 ; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 16
+; CHECK-ALIGNED-RV32-NEXT:    slli a1, a1, 24
+; CHECK-ALIGNED-RV32-NEXT:    or a2, a3, a2
+; CHECK-ALIGNED-RV32-NEXT:    or a1, a1, a4
+; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 1(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a5, 2(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV32-NEXT:    slli a5, a5, 16
 ; CHECK-ALIGNED-RV32-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a4
-; CHECK-ALIGNED-RV32-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a2
+; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a5
+; CHECK-ALIGNED-RV32-NEXT:    or a1, a1, a2
+; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lb a4, 3(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV64-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV64-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 0(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 1(a0)
-; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 2(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV64-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 0(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 1(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 2(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 8
 ; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a1, a1, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a2, a3, a2
+; CHECK-ALIGNED-RV64-NEXT:    or a1, a1, a4
+; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 1(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 2(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 16
 ; CHECK-ALIGNED-RV64-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a4
-; CHECK-ALIGNED-RV64-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a2
+; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a5
+; CHECK-ALIGNED-RV64-NEXT:    or a1, a1, a2
+; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 3(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 0(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a5, 1(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 2(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 0(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 1(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 2(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a3, a3, 8
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a1, a1, 24
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a2, a3, a2
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a1, a4
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 1(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a5, 2(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a5, a5, 16
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a4
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a2
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a5
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a1, a2
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a4, 3(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 0(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a5, 1(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 2(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 0(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 1(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 2(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 8
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a1, a1, 24
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a2, a3, a2
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a1, a4
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 1(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a5, 2(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a5, a5, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a4
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a5
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a1, a2
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
@@ -4518,16 +4518,16 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a3, 1(a1)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a4, 2(a1)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a1, 3(a1)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a5, 1(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a6, 2(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a7, 3(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a0, 0(a0)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a1, a4, a1
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a2, a2, a3
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a3, a6, a7
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a0, a0, a5
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a4, 1(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a5, 2(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a0, a5, a0
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a3, a3, a4
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a1, a2, a1
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a0, a0, a3
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a0, a3, a0
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
@@ -4538,72 +4538,72 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 1(a1)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a4, 2(a1)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a1, 3(a1)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a5, 0(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a6, 1(a0)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a2, a2, a3
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 2(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a5, 1(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a6, 2(a0)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a5, a5, a6
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a3, a3, a5
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a1, a1, 24
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a6, a6, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a3
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a6
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a5
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV32-V-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 3(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 0(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 1(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 2(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 0(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 1(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 2(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a3, a3, 8
 ; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a1, a1, 24
+; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a3, a2
+; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a1, a4
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 1(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 2(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 16
 ; CHECK-ALIGNED-RV32-V-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a2
+; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a5
+; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a1, a2
+; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-V-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-V-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lb a4, 3(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 0(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 1(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 2(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 0(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 1(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 2(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 8
 ; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a1, a1, 24
+; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a3, a2
+; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a4
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 1(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 2(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a2
+; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a5
+; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a2
+; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-V-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll
index f0290298e362a..ec83f16682296 100644
--- a/llvm/test/CodeGen/RISCV/memcmp.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp.ll
@@ -3355,14 +3355,14 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a2, 2(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a0, 0(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a3, 2(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a1, 0(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 16
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a0, a2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    slli a3, a3, 16
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a0, 2(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    slli a0, a0, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a2, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a1, 2(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    slli a1, a1, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a1, a2, a1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a2, a0, a1
@@ -3372,14 +3372,14 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a2, 2(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a0, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a3, 2(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a1, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 16
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 16
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a0, 2(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a0, a0, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a2, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a1, 2(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a1, a1, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a2, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
@@ -3393,10 +3393,10 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a0, 2(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a3, 0(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a1, 2(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    pack a0, a2, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a2, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a1, 2(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    pack a1, a2, a1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a2, a0, a1
@@ -3406,14 +3406,14 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a2, 2(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a0, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a3, 2(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a1, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    slli a2, a2, 16
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    slli a3, a3, 16
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a0, 2(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    slli a0, a0, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a2, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a1, 2(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    slli a1, a1, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a1, a2, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
@@ -3741,14 +3741,14 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_5:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a2, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a0, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a3, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a1, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a2, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a2, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
@@ -3778,10 +3778,10 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a1, 4(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a0, a2, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a2, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a1, a2, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
@@ -3940,14 +3940,14 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_6:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a2, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a0, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a3, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a1, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a2, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a2, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
@@ -3983,10 +3983,10 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a1, 4(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a0, a2, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a2, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a1, a2, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
@@ -5980,104 +5980,104 @@ entry:
 define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 3(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV32-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV32-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a5, 1(a0)
-; CHECK-ALIGNED-RV32-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 2(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV32-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 0(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 1(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 2(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-NEXT:    slli a3, a3, 8
 ; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 16
+; CHECK-ALIGNED-RV32-NEXT:    slli a1, a1, 24
+; CHECK-ALIGNED-RV32-NEXT:    or a2, a3, a2
+; CHECK-ALIGNED-RV32-NEXT:    or a1, a1, a4
+; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 1(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a5, 2(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV32-NEXT:    slli a5, a5, 16
 ; CHECK-ALIGNED-RV32-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a4
-; CHECK-ALIGNED-RV32-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a2
+; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a5
+; CHECK-ALIGNED-RV32-NEXT:    or a1, a1, a2
+; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lb a4, 3(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV64-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV64-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 0(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 1(a0)
-; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 2(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV64-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 0(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 1(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 2(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 8
 ; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a1, a1, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a2, a3, a2
+; CHECK-ALIGNED-RV64-NEXT:    or a1, a1, a4
+; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 1(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 2(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 16
 ; CHECK-ALIGNED-RV64-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a4
-; CHECK-ALIGNED-RV64-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a2
+; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a5
+; CHECK-ALIGNED-RV64-NEXT:    or a1, a1, a2
+; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 3(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 0(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a5, 1(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 2(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 0(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 1(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 2(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a3, a3, 8
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a1, a1, 24
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a2, a3, a2
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a1, a4
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 1(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a5, 2(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a5, a5, 16
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a4
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a2
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a5
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a1, a2
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a4, 3(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 0(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a5, 1(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 2(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 0(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 1(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 2(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 8
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a1, a1, 24
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a2, a3, a2
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a1, a4
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 1(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a5, 2(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a5, a5, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a4
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a5
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a1, a2
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
@@ -6088,16 +6088,16 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a3, 1(a1)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a4, 2(a1)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a1, 3(a1)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a5, 1(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a6, 2(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a7, 3(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a0, 0(a0)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a1, a4, a1
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a2, a2, a3
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a3, a6, a7
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a0, a0, a5
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a4, 1(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a5, 2(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a0, a5, a0
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a3, a3, a4
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a1, a2, a1
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a0, a0, a3
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a0, a3, a0
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
@@ -6108,72 +6108,72 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 1(a1)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a4, 2(a1)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a1, 3(a1)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a5, 0(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a6, 1(a0)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a2, a2, a3
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 2(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a5, 1(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a6, 2(a0)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a5, a5, a6
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a3, a3, a5
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a1, a1, 24
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a6, a6, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a3
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a6
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a5
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV32-V-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 3(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 0(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 1(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 2(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 0(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 1(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 2(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a3, a3, 8
 ; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a1, a1, 24
+; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a3, a2
+; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a1, a4
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 1(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 2(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 16
 ; CHECK-ALIGNED-RV32-V-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a2
+; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a5
+; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a1, a2
+; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-V-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-V-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lb a4, 3(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 0(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 1(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 2(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 0(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 1(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 2(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 8
 ; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a1, a1, 24
+; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a3, a2
+; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a4
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 1(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 2(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a2
+; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a5
+; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a2
+; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-V-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/memmove.ll b/llvm/test/CodeGen/RISCV/memmove.ll
index 62915bd4ad99d..4795d2c6a5209 100644
--- a/llvm/test/CodeGen/RISCV/memmove.ll
+++ b/llvm/test/CodeGen/RISCV/memmove.ll
@@ -159,38 +159,38 @@ entry:
 define void @unaligned_memmove7(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-LABEL: unaligned_memmove7:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lbu a2, 4(a1)
-; RV32-NEXT:    lbu a3, 5(a1)
-; RV32-NEXT:    lbu a4, 6(a1)
-; RV32-NEXT:    lbu a5, 0(a1)
-; RV32-NEXT:    lbu a6, 1(a1)
-; RV32-NEXT:    lbu a7, 2(a1)
-; RV32-NEXT:    lbu a1, 3(a1)
-; RV32-NEXT:    sb a2, 4(a0)
-; RV32-NEXT:    sb a3, 5(a0)
-; RV32-NEXT:    sb a4, 6(a0)
-; RV32-NEXT:    sb a5, 0(a0)
-; RV32-NEXT:    sb a6, 1(a0)
-; RV32-NEXT:    sb a7, 2(a0)
-; RV32-NEXT:    sb a1, 3(a0)
+; RV32-NEXT:    lbu a2, 0(a1)
+; RV32-NEXT:    lbu a3, 1(a1)
+; RV32-NEXT:    lbu a4, 2(a1)
+; RV32-NEXT:    lbu a5, 3(a1)
+; RV32-NEXT:    lbu a6, 4(a1)
+; RV32-NEXT:    lbu a7, 5(a1)
+; RV32-NEXT:    lbu a1, 6(a1)
+; RV32-NEXT:    sb a6, 4(a0)
+; RV32-NEXT:    sb a7, 5(a0)
+; RV32-NEXT:    sb a1, 6(a0)
+; RV32-NEXT:    sb a2, 0(a0)
+; RV32-NEXT:    sb a3, 1(a0)
+; RV32-NEXT:    sb a4, 2(a0)
+; RV32-NEXT:    sb a5, 3(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: unaligned_memmove7:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    lbu a2, 4(a1)
-; RV64-NEXT:    lbu a3, 5(a1)
-; RV64-NEXT:    lbu a4, 6(a1)
-; RV64-NEXT:    lbu a5, 0(a1)
-; RV64-NEXT:    lbu a6, 1(a1)
-; RV64-NEXT:    lbu a7, 2(a1)
-; RV64-NEXT:    lbu a1, 3(a1)
-; RV64-NEXT:    sb a2, 4(a0)
-; RV64-NEXT:    sb a3, 5(a0)
-; RV64-NEXT:    sb a4, 6(a0)
-; RV64-NEXT:    sb a5, 0(a0)
-; RV64-NEXT:    sb a6, 1(a0)
-; RV64-NEXT:    sb a7, 2(a0)
-; RV64-NEXT:    sb a1, 3(a0)
+; RV64-NEXT:    lbu a2, 0(a1)
+; RV64-NEXT:    lbu a3, 1(a1)
+; RV64-NEXT:    lbu a4, 2(a1)
+; RV64-NEXT:    lbu a5, 3(a1)
+; RV64-NEXT:    lbu a6, 4(a1)
+; RV64-NEXT:    lbu a7, 5(a1)
+; RV64-NEXT:    lbu a1, 6(a1)
+; RV64-NEXT:    sb a6, 4(a0)
+; RV64-NEXT:    sb a7, 5(a0)
+; RV64-NEXT:    sb a1, 6(a0)
+; RV64-NEXT:    sb a2, 0(a0)
+; RV64-NEXT:    sb a3, 1(a0)
+; RV64-NEXT:    sb a4, 2(a0)
+; RV64-NEXT:    sb a5, 3(a0)
 ; RV64-NEXT:    ret
 ;
 ; RV32-FAST-LABEL: unaligned_memmove7:
@@ -289,16 +289,16 @@ define void @unaligned_memmove15(ptr nocapture %dest, ptr %src) nounwind {
 ;
 ; RV32-FAST-LABEL: unaligned_memmove15:
 ; RV32-FAST:       # %bb.0: # %entry
-; RV32-FAST-NEXT:    lbu a2, 14(a1)
-; RV32-FAST-NEXT:    lw a3, 0(a1)
-; RV32-FAST-NEXT:    lw a4, 4(a1)
-; RV32-FAST-NEXT:    lw a5, 8(a1)
-; RV32-FAST-NEXT:    lh a1, 12(a1)
-; RV32-FAST-NEXT:    sb a2, 14(a0)
-; RV32-FAST-NEXT:    sw a3, 0(a0)
-; RV32-FAST-NEXT:    sw a4, 4(a0)
-; RV32-FAST-NEXT:    sw a5, 8(a0)
-; RV32-FAST-NEXT:    sh a1, 12(a0)
+; RV32-FAST-NEXT:    lw a2, 0(a1)
+; RV32-FAST-NEXT:    lw a3, 4(a1)
+; RV32-FAST-NEXT:    lw a4, 8(a1)
+; RV32-FAST-NEXT:    lh a5, 12(a1)
+; RV32-FAST-NEXT:    lbu a1, 14(a1)
+; RV32-FAST-NEXT:    sb a1, 14(a0)
+; RV32-FAST-NEXT:    sw a2, 0(a0)
+; RV32-FAST-NEXT:    sw a3, 4(a0)
+; RV32-FAST-NEXT:    sw a4, 8(a0)
+; RV32-FAST-NEXT:    sh a5, 12(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: unaligned_memmove15:
@@ -365,18 +365,18 @@ define void @unaligned_memmove31(ptr nocapture %dest, ptr %src) nounwind {
 ;
 ; RV64-FAST-LABEL: unaligned_memmove31:
 ; RV64-FAST:       # %bb.0: # %entry
-; RV64-FAST-NEXT:    lh a2, 28(a1)
-; RV64-FAST-NEXT:    lbu a3, 30(a1)
-; RV64-FAST-NEXT:    ld a4, 0(a1)
-; RV64-FAST-NEXT:    ld a5, 8(a1)
-; RV64-FAST-NEXT:    ld a6, 16(a1)
-; RV64-FAST-NEXT:    lw a1, 24(a1)
-; RV64-FAST-NEXT:    sh a2, 28(a0)
-; RV64-FAST-NEXT:    sb a3, 30(a0)
-; RV64-FAST-NEXT:    sd a4, 0(a0)
-; RV64-FAST-NEXT:    sd a5, 8(a0)
-; RV64-FAST-NEXT:    sd a6, 16(a0)
-; RV64-FAST-NEXT:    sw a1, 24(a0)
+; RV64-FAST-NEXT:    ld a2, 0(a1)
+; RV64-FAST-NEXT:    ld a3, 8(a1)
+; RV64-FAST-NEXT:    ld a4, 16(a1)
+; RV64-FAST-NEXT:    lw a5, 24(a1)
+; RV64-FAST-NEXT:    lh a6, 28(a1)
+; RV64-FAST-NEXT:    lbu a1, 30(a1)
+; RV64-FAST-NEXT:    sh a6, 28(a0)
+; RV64-FAST-NEXT:    sb a1, 30(a0)
+; RV64-FAST-NEXT:    sd a2, 0(a0)
+; RV64-FAST-NEXT:    sd a3, 8(a0)
+; RV64-FAST-NEXT:    sd a4, 16(a0)
+; RV64-FAST-NEXT:    sw a5, 24(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
   tail call void @llvm.memmove.p0.p0.i64(ptr %dest, ptr %src, i64 31, i1 false)
@@ -579,18 +579,18 @@ define void @aligned_memmove31(ptr nocapture %dest, ptr %src) nounwind {
 ;
 ; RV64-BOTH-LABEL: aligned_memmove31:
 ; RV64-BOTH:       # %bb.0: # %entry
-; RV64-BOTH-NEXT:    lh a2, 28(a1)
-; RV64-BOTH-NEXT:    lbu a3, 30(a1)
-; RV64-BOTH-NEXT:    ld a4, 0(a1)
-; RV64-BOTH-NEXT:    ld a5, 8(a1)
-; RV64-BOTH-NEXT:    ld a6, 16(a1)
-; RV64-BOTH-NEXT:    lw a1, 24(a1)
-; RV64-BOTH-NEXT:    sh a2, 28(a0)
-; RV64-BOTH-NEXT:    sb a3, 30(a0)
-; RV64-BOTH-NEXT:    sd a4, 0(a0)
-; RV64-BOTH-NEXT:    sd a5, 8(a0)
-; RV64-BOTH-NEXT:    sd a6, 16(a0)
-; RV64-BOTH-NEXT:    sw a1, 24(a0)
+; RV64-BOTH-NEXT:    ld a2, 0(a1)
+; RV64-BOTH-NEXT:    ld a3, 8(a1)
+; RV64-BOTH-NEXT:    ld a4, 16(a1)
+; RV64-BOTH-NEXT:    lw a5, 24(a1)
+; RV64-BOTH-NEXT:    lh a6, 28(a1)
+; RV64-BOTH-NEXT:    lbu a1, 30(a1)
+; RV64-BOTH-NEXT:    sh a6, 28(a0)
+; RV64-BOTH-NEXT:    sb a1, 30(a0)
+; RV64-BOTH-NEXT:    sd a2, 0(a0)
+; RV64-BOTH-NEXT:    sd a3, 8(a0)
+; RV64-BOTH-NEXT:    sd a4, 16(a0)
+; RV64-BOTH-NEXT:    sw a5, 24(a0)
 ; RV64-BOTH-NEXT:    ret
 entry:
   tail call void @llvm.memmove.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 31, i1 false)
diff --git a/llvm/test/CodeGen/RISCV/memset-pattern.ll b/llvm/test/CodeGen/RISCV/memset-pattern.ll
index 35ce7fad0ea67..3b80c5684bfd0 100644
--- a/llvm/test/CodeGen/RISCV/memset-pattern.ll
+++ b/llvm/test/CodeGen/RISCV/memset-pattern.ll
@@ -15,24 +15,24 @@ define void @memset_1(ptr %a, i128 %value) nounwind {
 ; RV32-BOTH-LABEL: memset_1:
 ; RV32-BOTH:       # %bb.0: # %loadstoreloop.preheader
 ; RV32-BOTH-NEXT:    li a2, 0
-; RV32-BOTH-NEXT:    lw a3, 0(a1)
-; RV32-BOTH-NEXT:    lw a4, 4(a1)
-; RV32-BOTH-NEXT:    lw a5, 8(a1)
+; RV32-BOTH-NEXT:    li a3, 0
+; RV32-BOTH-NEXT:    lw a4, 0(a1)
+; RV32-BOTH-NEXT:    lw a5, 4(a1)
+; RV32-BOTH-NEXT:    lw a6, 8(a1)
 ; RV32-BOTH-NEXT:    lw a1, 12(a1)
-; RV32-BOTH-NEXT:    li a6, 0
 ; RV32-BOTH-NEXT:  .LBB0_1: # %loadstoreloop
 ; RV32-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-BOTH-NEXT:    slli a7, a2, 4
 ; RV32-BOTH-NEXT:    addi a2, a2, 1
 ; RV32-BOTH-NEXT:    add a7, a0, a7
-; RV32-BOTH-NEXT:    seqz t0, a2
-; RV32-BOTH-NEXT:    add a6, a6, t0
-; RV32-BOTH-NEXT:    or t0, a2, a6
-; RV32-BOTH-NEXT:    sw a3, 0(a7)
-; RV32-BOTH-NEXT:    sw a4, 4(a7)
-; RV32-BOTH-NEXT:    sw a5, 8(a7)
+; RV32-BOTH-NEXT:    sw a4, 0(a7)
+; RV32-BOTH-NEXT:    sw a5, 4(a7)
+; RV32-BOTH-NEXT:    sw a6, 8(a7)
 ; RV32-BOTH-NEXT:    sw a1, 12(a7)
-; RV32-BOTH-NEXT:    beqz t0, .LBB0_1
+; RV32-BOTH-NEXT:    seqz a7, a2
+; RV32-BOTH-NEXT:    add a3, a3, a7
+; RV32-BOTH-NEXT:    or a7, a2, a3
+; RV32-BOTH-NEXT:    beqz a7, .LBB0_1
 ; RV32-BOTH-NEXT:  # %bb.2: # %split
 ; RV32-BOTH-NEXT:    ret
 ;
@@ -60,19 +60,18 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
 ; RV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    li a2, 0
 ; RV32-NEXT:    li a3, 0
-; RV32-NEXT:    lw a4, 4(a1)
-; RV32-NEXT:    lw a5, 0(a1)
+; RV32-NEXT:    lw a4, 0(a1)
+; RV32-NEXT:    lw a5, 4(a1)
 ; RV32-NEXT:    lw a6, 8(a1)
 ; RV32-NEXT:    lw a1, 12(a1)
-; RV32-NEXT:    srli a7, a4, 24
-; RV32-NEXT:    srli t0, a4, 16
-; RV32-NEXT:    srli t1, a4, 8
-; RV32-NEXT:    srli t2, a5, 24
-; RV32-NEXT:    srli t3, a5, 16
-; RV32-NEXT:    srli t4, a5, 8
+; RV32-NEXT:    srli a7, a5, 24
+; RV32-NEXT:    srli t0, a5, 16
+; RV32-NEXT:    srli t1, a5, 8
+; RV32-NEXT:    srli t2, a4, 24
+; RV32-NEXT:    srli t3, a4, 16
+; RV32-NEXT:    srli t4, a4, 8
 ; RV32-NEXT:    srli t5, a6, 24
 ; RV32-NEXT:    srli t6, a6, 16
 ; RV32-NEXT:    srli s0, a6, 8
@@ -84,12 +83,11 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
 ; RV32-NEXT:    slli s4, a2, 4
 ; RV32-NEXT:    addi a2, a2, 1
 ; RV32-NEXT:    add s4, a0, s4
-; RV32-NEXT:    seqz s5, a2
-; RV32-NEXT:    sb a4, 4(s4)
+; RV32-NEXT:    sb a5, 4(s4)
 ; RV32-NEXT:    sb t1, 5(s4)
 ; RV32-NEXT:    sb t0, 6(s4)
 ; RV32-NEXT:    sb a7, 7(s4)
-; RV32-NEXT:    sb a5, 0(s4)
+; RV32-NEXT:    sb a4, 0(s4)
 ; RV32-NEXT:    sb t4, 1(s4)
 ; RV32-NEXT:    sb t3, 2(s4)
 ; RV32-NEXT:    sb t2, 3(s4)
@@ -97,20 +95,20 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
 ; RV32-NEXT:    sb s0, 9(s4)
 ; RV32-NEXT:    sb t6, 10(s4)
 ; RV32-NEXT:    sb t5, 11(s4)
-; RV32-NEXT:    add a3, a3, s5
-; RV32-NEXT:    or s5, a2, a3
 ; RV32-NEXT:    sb a1, 12(s4)
 ; RV32-NEXT:    sb s3, 13(s4)
 ; RV32-NEXT:    sb s2, 14(s4)
 ; RV32-NEXT:    sb s1, 15(s4)
-; RV32-NEXT:    beqz s5, .LBB1_1
+; RV32-NEXT:    seqz s4, a2
+; RV32-NEXT:    add a3, a3, s4
+; RV32-NEXT:    or s4, a2, a3
+; RV32-NEXT:    beqz s4, .LBB1_1
 ; RV32-NEXT:  # %bb.2: # %split
 ; RV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
 ;
@@ -165,24 +163,24 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
 ; RV32-FAST-LABEL: memset_1_noalign:
 ; RV32-FAST:       # %bb.0: # %loadstoreloop.preheader
 ; RV32-FAST-NEXT:    li a2, 0
-; RV32-FAST-NEXT:    lw a3, 0(a1)
-; RV32-FAST-NEXT:    lw a4, 4(a1)
-; RV32-FAST-NEXT:    lw a5, 8(a1)
+; RV32-FAST-NEXT:    li a3, 0
+; RV32-FAST-NEXT:    lw a4, 0(a1)
+; RV32-FAST-NEXT:    lw a5, 4(a1)
+; RV32-FAST-NEXT:    lw a6, 8(a1)
 ; RV32-FAST-NEXT:    lw a1, 12(a1)
-; RV32-FAST-NEXT:    li a6, 0
 ; RV32-FAST-NEXT:  .LBB1_1: # %loadstoreloop
 ; RV32-FAST-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-FAST-NEXT:    slli a7, a2, 4
 ; RV32-FAST-NEXT:    addi a2, a2, 1
 ; RV32-FAST-NEXT:    add a7, a0, a7
-; RV32-FAST-NEXT:    seqz t0, a2
-; RV32-FAST-NEXT:    add a6, a6, t0
-; RV32-FAST-NEXT:    or t0, a2, a6
-; RV32-FAST-NEXT:    sw a3, 0(a7)
-; RV32-FAST-NEXT:    sw a4, 4(a7)
-; RV32-FAST-NEXT:    sw a5, 8(a7)
+; RV32-FAST-NEXT:    sw a4, 0(a7)
+; RV32-FAST-NEXT:    sw a5, 4(a7)
+; RV32-FAST-NEXT:    sw a6, 8(a7)
 ; RV32-FAST-NEXT:    sw a1, 12(a7)
-; RV32-FAST-NEXT:    beqz t0, .LBB1_1
+; RV32-FAST-NEXT:    seqz a7, a2
+; RV32-FAST-NEXT:    add a3, a3, a7
+; RV32-FAST-NEXT:    or a7, a2, a3
+; RV32-FAST-NEXT:    beqz a7, .LBB1_1
 ; RV32-FAST-NEXT:  # %bb.2: # %split
 ; RV32-FAST-NEXT:    ret
 ;
@@ -205,26 +203,26 @@ define void @memset_4(ptr %a, i128 %value) nounwind {
 ; RV32-BOTH-LABEL: memset_4:
 ; RV32-BOTH:       # %bb.0: # %loadstoreloop.preheader
 ; RV32-BOTH-NEXT:    li a2, 0
-; RV32-BOTH-NEXT:    lw a3, 0(a1)
-; RV32-BOTH-NEXT:    lw a4, 4(a1)
-; RV32-BOTH-NEXT:    lw a5, 8(a1)
+; RV32-BOTH-NEXT:    li a3, 0
+; RV32-BOTH-NEXT:    lw a4, 0(a1)
+; RV32-BOTH-NEXT:    lw a5, 4(a1)
+; RV32-BOTH-NEXT:    lw a6, 8(a1)
 ; RV32-BOTH-NEXT:    lw a1, 12(a1)
-; RV32-BOTH-NEXT:    li a6, 0
 ; RV32-BOTH-NEXT:  .LBB2_1: # %loadstoreloop
 ; RV32-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-BOTH-NEXT:    slli a7, a2, 4
 ; RV32-BOTH-NEXT:    addi a2, a2, 1
-; RV32-BOTH-NEXT:    seqz t0, a2
-; RV32-BOTH-NEXT:    sltiu t1, a2, 4
-; RV32-BOTH-NEXT:    add a6, a6, t0
-; RV32-BOTH-NEXT:    seqz t0, a6
-; RV32-BOTH-NEXT:    and t0, t0, t1
 ; RV32-BOTH-NEXT:    add a7, a0, a7
-; RV32-BOTH-NEXT:    sw a3, 0(a7)
-; RV32-BOTH-NEXT:    sw a4, 4(a7)
-; RV32-BOTH-NEXT:    sw a5, 8(a7)
+; RV32-BOTH-NEXT:    seqz t0, a2
+; RV32-BOTH-NEXT:    sw a4, 0(a7)
+; RV32-BOTH-NEXT:    sw a5, 4(a7)
+; RV32-BOTH-NEXT:    sw a6, 8(a7)
 ; RV32-BOTH-NEXT:    sw a1, 12(a7)
-; RV32-BOTH-NEXT:    bnez t0, .LBB2_1
+; RV32-BOTH-NEXT:    add a3, a3, t0
+; RV32-BOTH-NEXT:    seqz a7, a3
+; RV32-BOTH-NEXT:    sltiu t0, a2, 4
+; RV32-BOTH-NEXT:    and a7, a7, t0
+; RV32-BOTH-NEXT:    bnez a7, .LBB2_1
 ; RV32-BOTH-NEXT:  # %bb.2: # %split
 ; RV32-BOTH-NEXT:    ret
 ;
@@ -250,28 +248,28 @@ define void @memset_x(ptr %a, i128 %value, i64 %x) nounwind {
 ; RV32-BOTH-NEXT:    beqz a4, .LBB3_5
 ; RV32-BOTH-NEXT:  # %bb.1: # %loadstoreloop.preheader
 ; RV32-BOTH-NEXT:    li a4, 0
-; RV32-BOTH-NEXT:    lw a5, 0(a1)
-; RV32-BOTH-NEXT:    lw a6, 4(a1)
-; RV32-BOTH-NEXT:    lw a7, 8(a1)
+; RV32-BOTH-NEXT:    li a5, 0
+; RV32-BOTH-NEXT:    lw a6, 0(a1)
+; RV32-BOTH-NEXT:    lw a7, 4(a1)
+; RV32-BOTH-NEXT:    lw t0, 8(a1)
 ; RV32-BOTH-NEXT:    lw a1, 12(a1)
-; RV32-BOTH-NEXT:    li t0, 0
 ; RV32-BOTH-NEXT:    j .LBB3_3
 ; RV32-BOTH-NEXT:  .LBB3_2: # %loadstoreloop
 ; RV32-BOTH-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32-BOTH-NEXT:    sltu t1, t0, a3
+; RV32-BOTH-NEXT:    sltu t1, a5, a3
 ; RV32-BOTH-NEXT:    beqz t1, .LBB3_5
 ; RV32-BOTH-NEXT:  .LBB3_3: # %loadstoreloop
 ; RV32-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-BOTH-NEXT:    slli t1, a4, 4
 ; RV32-BOTH-NEXT:    addi a4, a4, 1
-; RV32-BOTH-NEXT:    seqz t2, a4
-; RV32-BOTH-NEXT:    add t0, t0, t2
 ; RV32-BOTH-NEXT:    add t1, a0, t1
-; RV32-BOTH-NEXT:    sw a5, 0(t1)
-; RV32-BOTH-NEXT:    sw a6, 4(t1)
-; RV32-BOTH-NEXT:    sw a7, 8(t1)
+; RV32-BOTH-NEXT:    sw a6, 0(t1)
+; RV32-BOTH-NEXT:    sw a7, 4(t1)
+; RV32-BOTH-NEXT:    sw t0, 8(t1)
 ; RV32-BOTH-NEXT:    sw a1, 12(t1)
-; RV32-BOTH-NEXT:    bne t0, a3, .LBB3_2
+; RV32-BOTH-NEXT:    seqz t1, a4
+; RV32-BOTH-NEXT:    add a5, a5, t1
+; RV32-BOTH-NEXT:    bne a5, a3, .LBB3_2
 ; RV32-BOTH-NEXT:  # %bb.4: # in Loop: Header=BB3_3 Depth=1
 ; RV32-BOTH-NEXT:    sltu t1, a4, a2
 ; RV32-BOTH-NEXT:    bnez t1, .LBB3_3
diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index 548c7e1c6ea8c..39dca893bd428 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1298,34 +1298,34 @@ define i64 @muli64_m3840(i64 %a) nounwind {
 define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV32I-LABEL: muli128_m3840:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    lw a6, 0(a1)
 ; RV32I-NEXT:    lw a3, 4(a1)
 ; RV32I-NEXT:    lw a2, 8(a1)
-; RV32I-NEXT:    lw a5, 0(a1)
 ; RV32I-NEXT:    lw a4, 12(a1)
 ; RV32I-NEXT:    srli a1, a3, 20
-; RV32I-NEXT:    slli a6, a2, 12
+; RV32I-NEXT:    slli a5, a2, 12
 ; RV32I-NEXT:    srli a7, a3, 24
 ; RV32I-NEXT:    slli t0, a2, 8
 ; RV32I-NEXT:    srli t1, a2, 20
-; RV32I-NEXT:    or a1, a6, a1
-; RV32I-NEXT:    slli a6, a4, 12
+; RV32I-NEXT:    or a1, a5, a1
+; RV32I-NEXT:    slli a5, a4, 12
 ; RV32I-NEXT:    srli t2, a2, 24
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    or a2, t0, a7
-; RV32I-NEXT:    srli a7, a5, 20
-; RV32I-NEXT:    or a6, a6, t1
+; RV32I-NEXT:    srli a7, a6, 20
+; RV32I-NEXT:    or a5, a5, t1
 ; RV32I-NEXT:    slli t0, a3, 12
 ; RV32I-NEXT:    or t1, a4, t2
-; RV32I-NEXT:    srli t2, a5, 24
+; RV32I-NEXT:    srli t2, a6, 24
 ; RV32I-NEXT:    slli t3, a3, 8
 ; RV32I-NEXT:    or a3, t0, a7
-; RV32I-NEXT:    slli a4, a5, 12
-; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    slli a4, a6, 12
+; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    or t0, t3, t2
-; RV32I-NEXT:    sltu t2, a2, a1
-; RV32I-NEXT:    sub a6, t1, a6
-; RV32I-NEXT:    sltu a7, a5, a4
-; RV32I-NEXT:    sub a6, a6, t2
+; RV32I-NEXT:    sltu a7, a2, a1
+; RV32I-NEXT:    sub a5, t1, a5
+; RV32I-NEXT:    sub a5, a5, a7
+; RV32I-NEXT:    sltu a7, a6, a4
 ; RV32I-NEXT:    mv t1, a7
 ; RV32I-NEXT:    beq t0, a3, .LBB36_2
 ; RV32I-NEXT:  # %bb.1:
@@ -1333,15 +1333,15 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV32I-NEXT:  .LBB36_2:
 ; RV32I-NEXT:    sub a2, a2, a1
 ; RV32I-NEXT:    sub a1, t0, a3
-; RV32I-NEXT:    sub a5, a5, a4
-; RV32I-NEXT:    sltu a3, a2, t1
+; RV32I-NEXT:    sub a3, a6, a4
+; RV32I-NEXT:    sltu a4, a2, t1
 ; RV32I-NEXT:    sub a2, a2, t1
 ; RV32I-NEXT:    sub a1, a1, a7
-; RV32I-NEXT:    sub a3, a6, a3
-; RV32I-NEXT:    sw a5, 0(a0)
+; RV32I-NEXT:    sub a5, a5, a4
+; RV32I-NEXT:    sw a3, 0(a0)
 ; RV32I-NEXT:    sw a1, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a5, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli128_m3840:
diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll
index fe19a4fa8bbd8..a57acf5576cb7 100644
--- a/llvm/test/CodeGen/RISCV/neg-abs.ll
+++ b/llvm/test/CodeGen/RISCV/neg-abs.ll
@@ -162,35 +162,35 @@ define i32 @neg_abs32_multiuse(i32 %x, ptr %y) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srai a2, a0, 31
 ; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    sub a2, a0, a2
-; RV32I-NEXT:    neg a0, a2
-; RV32I-NEXT:    sw a2, 0(a1)
+; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sw a0, 0(a1)
+; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: neg_abs32_multiuse:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    neg a2, a0
-; RV32ZBB-NEXT:    max a2, a0, a2
-; RV32ZBB-NEXT:    neg a0, a2
-; RV32ZBB-NEXT:    sw a2, 0(a1)
+; RV32ZBB-NEXT:    max a0, a0, a2
+; RV32ZBB-NEXT:    sw a0, 0(a1)
+; RV32ZBB-NEXT:    neg a0, a0
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64I-LABEL: neg_abs32_multiuse:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sraiw a2, a0, 31
 ; RV64I-NEXT:    xor a0, a0, a2
-; RV64I-NEXT:    subw a2, a0, a2
-; RV64I-NEXT:    negw a0, a2
-; RV64I-NEXT:    sw a2, 0(a1)
+; RV64I-NEXT:    subw a0, a0, a2
+; RV64I-NEXT:    sw a0, 0(a1)
+; RV64I-NEXT:    negw a0, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: neg_abs32_multiuse:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    sext.w a0, a0
 ; RV64ZBB-NEXT:    negw a2, a0
-; RV64ZBB-NEXT:    max a2, a0, a2
-; RV64ZBB-NEXT:    negw a0, a2
-; RV64ZBB-NEXT:    sw a2, 0(a1)
+; RV64ZBB-NEXT:    max a0, a0, a2
+; RV64ZBB-NEXT:    sw a0, 0(a1)
+; RV64ZBB-NEXT:    negw a0, a0
 ; RV64ZBB-NEXT:    ret
   %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true)
   store i32 %abs, ptr %y
@@ -208,14 +208,12 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) {
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:  .LBB5_2:
-; RV32I-NEXT:    snez a3, a0
-; RV32I-NEXT:    neg a4, a1
-; RV32I-NEXT:    sub a3, a4, a3
-; RV32I-NEXT:    neg a4, a0
 ; RV32I-NEXT:    sw a0, 0(a2)
 ; RV32I-NEXT:    sw a1, 4(a2)
-; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: neg_abs64_multiuse:
@@ -227,31 +225,29 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) {
 ; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    neg a0, a0
 ; RV32ZBB-NEXT:  .LBB5_2:
-; RV32ZBB-NEXT:    snez a3, a0
-; RV32ZBB-NEXT:    neg a4, a1
-; RV32ZBB-NEXT:    sub a3, a4, a3
-; RV32ZBB-NEXT:    neg a4, a0
 ; RV32ZBB-NEXT:    sw a0, 0(a2)
 ; RV32ZBB-NEXT:    sw a1, 4(a2)
-; RV32ZBB-NEXT:    mv a0, a4
-; RV32ZBB-NEXT:    mv a1, a3
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    sub a1, a1, a2
+; RV32ZBB-NEXT:    neg a0, a0
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64I-LABEL: neg_abs64_multiuse:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srai a2, a0, 63
 ; RV64I-NEXT:    xor a0, a0, a2
-; RV64I-NEXT:    sub a2, a0, a2
-; RV64I-NEXT:    neg a0, a2
-; RV64I-NEXT:    sd a2, 0(a1)
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    sd a0, 0(a1)
+; RV64I-NEXT:    neg a0, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: neg_abs64_multiuse:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    neg a2, a0
-; RV64ZBB-NEXT:    max a2, a0, a2
-; RV64ZBB-NEXT:    neg a0, a2
-; RV64ZBB-NEXT:    sd a2, 0(a1)
+; RV64ZBB-NEXT:    max a0, a0, a2
+; RV64ZBB-NEXT:    sd a0, 0(a1)
+; RV64ZBB-NEXT:    neg a0, a0
 ; RV64ZBB-NEXT:    ret
   %abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true)
   store i64 %abs, ptr %y
diff --git a/llvm/test/CodeGen/RISCV/orc-b-patterns.ll b/llvm/test/CodeGen/RISCV/orc-b-patterns.ll
index 5ede992e844f1..ff9d7a009fc29 100644
--- a/llvm/test/CodeGen/RISCV/orc-b-patterns.ll
+++ b/llvm/test/CodeGen/RISCV/orc-b-patterns.ll
@@ -233,9 +233,9 @@ define i32 @orc_b_i32_sub_shl8x_x_b1_shl_used(i32 %x, ptr %arr) {
 ; RV32I-NEXT:    addi a2, a2, 514
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    slli a2, a0, 7
-; RV32I-NEXT:    srli a3, a0, 1
-; RV32I-NEXT:    sub a0, a2, a3
-; RV32I-NEXT:    sw a3, 0(a1)
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    sw a0, 0(a1)
+; RV32I-NEXT:    sub a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_shl_used:
@@ -244,8 +244,8 @@ define i32 @orc_b_i32_sub_shl8x_x_b1_shl_used(i32 %x, ptr %arr) {
 ; RV32ZBB-NEXT:    addi a2, a2, 514
 ; RV32ZBB-NEXT:    and a0, a0, a2
 ; RV32ZBB-NEXT:    srli a2, a0, 1
-; RV32ZBB-NEXT:    orc.b a0, a0
 ; RV32ZBB-NEXT:    sw a2, 0(a1)
+; RV32ZBB-NEXT:    orc.b a0, a0
 ; RV32ZBB-NEXT:    ret
 entry:
   %and = and i32 %x, 33686018
@@ -264,8 +264,8 @@ define i32 @orc_b_i32_sub_shl8x_x_b1_srl_used(i32  %x, ptr %arr) {
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    slli a2, a0, 7
 ; RV32I-NEXT:    srli a0, a0, 1
-; RV32I-NEXT:    sub a0, a2, a0
 ; RV32I-NEXT:    sw a2, 0(a1)
+; RV32I-NEXT:    sub a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_srl_used:
@@ -274,8 +274,8 @@ define i32 @orc_b_i32_sub_shl8x_x_b1_srl_used(i32  %x, ptr %arr) {
 ; RV32ZBB-NEXT:    addi a2, a2, 514
 ; RV32ZBB-NEXT:    and a0, a0, a2
 ; RV32ZBB-NEXT:    slli a2, a0, 7
-; RV32ZBB-NEXT:    orc.b a0, a0
 ; RV32ZBB-NEXT:    sw a2, 0(a1)
+; RV32ZBB-NEXT:    orc.b a0, a0
 ; RV32ZBB-NEXT:    ret
 entry:
   %and = and i32 %x, 33686018
@@ -320,8 +320,8 @@ define i32 @orc_b_i32_sub_shl8x_x_shl_used(i32  %x, ptr %arr){
 ; CHECK-NEXT:    addi a2, a2, 257
 ; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    slli a2, a0, 8
-; CHECK-NEXT:    sub a0, a2, a0
 ; CHECK-NEXT:    sw a2, 0(a1)
+; CHECK-NEXT:    sub a0, a2, a0
 ; CHECK-NEXT:    ret
 entry:
   %and = and i32 %x, 16843009
@@ -338,10 +338,10 @@ define i32 @orc_b_i32_sub_shl8x_x_b1_both_used(i32  %x, ptr %arr) {
 ; CHECK-NEXT:    addi a2, a2, 514
 ; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    slli a2, a0, 7
-; CHECK-NEXT:    srli a3, a0, 1
-; CHECK-NEXT:    sub a0, a2, a3
+; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    sw a2, 0(a1)
-; CHECK-NEXT:    sw a3, 4(a1)
+; CHECK-NEXT:    sw a0, 4(a1)
+; CHECK-NEXT:    sub a0, a2, a0
 ; CHECK-NEXT:    ret
 entry:
   %and = and i32 %x, 33686018
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index 5a01d43fea56b..48ba11b260bda 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -373,12 +373,12 @@ define i64 @uaddo6_xor(i64 %a, i64 %b) {
 ;
 ; RV64-LABEL: uaddo6_xor:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    not a2, a0
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bltu a2, a1, .LBB8_2
+; RV64-NEXT:    not a0, a0
+; RV64-NEXT:    bltu a0, a1, .LBB8_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a0, 42
+; RV64-NEXT:    li a1, 42
 ; RV64-NEXT:  .LBB8_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %x = xor i64 %a, -1
   %cmp = icmp ult i64 %x, %b
@@ -409,12 +409,12 @@ define i64 @uaddo6_xor_commuted(i64 %a, i64 %b) {
 ;
 ; RV64-LABEL: uaddo6_xor_commuted:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    not a2, a0
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bltu a2, a1, .LBB9_2
+; RV64-NEXT:    not a0, a0
+; RV64-NEXT:    bltu a0, a1, .LBB9_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a0, 42
+; RV64-NEXT:    li a1, 42
 ; RV64-NEXT:  .LBB9_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %x = xor i64 %a, -1
   %cmp = icmp ult i64 %x, %b
@@ -436,8 +436,8 @@ define i64 @uaddo6_xor_multi_use(i64 %a, i64 %b) {
 ; RV32-NEXT:    .cfi_offset s0, -8
 ; RV32-NEXT:    .cfi_offset s1, -12
 ; RV32-NEXT:    mv s0, a2
-; RV32-NEXT:    not a1, a1
 ; RV32-NEXT:    not a0, a0
+; RV32-NEXT:    not a1, a1
 ; RV32-NEXT:    beq a1, a3, .LBB10_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    sltu a2, a1, a3
@@ -472,8 +472,8 @@ define i64 @uaddo6_xor_multi_use(i64 %a, i64 %b) {
 ; RV64-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    not a0, a0
 ; RV64-NEXT:    mv s0, a1
+; RV64-NEXT:    not a0, a0
 ; RV64-NEXT:    bltu a0, a1, .LBB10_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li s0, 42
@@ -499,17 +499,17 @@ define i64 @uaddo6_xor_multi_use(i64 %a, i64 %b) {
 define i1 @uaddo6_xor_op_after_XOR(i32 %a, ptr %b.ptr) {
 ; RV32-LABEL: uaddo6_xor_op_after_XOR:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 0(a1)
 ; RV32-NEXT:    not a0, a0
+; RV32-NEXT:    lw a1, 0(a1)
 ; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: uaddo6_xor_op_after_XOR:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lw a1, 0(a1)
 ; RV64-NEXT:    not a0, a0
 ; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    lw a1, 0(a1)
 ; RV64-NEXT:    sltu a0, a0, a1
 ; RV64-NEXT:    xori a0, a0, 1
 ; RV64-NEXT:    ret
@@ -811,8 +811,8 @@ define i1 @usubo_ult_i64_math_overflow_used(i64 %x, i64 %y, ptr %p) {
 ; RV64-LABEL: usubo_ult_i64_math_overflow_used:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    sub a3, a0, a1
-; RV64-NEXT:    sltu a0, a0, a1
 ; RV64-NEXT:    sd a3, 0(a2)
+; RV64-NEXT:    sltu a0, a0, a1
 ; RV64-NEXT:    ret
   %s = sub i64 %x, %y
   store i64 %s, ptr %p
@@ -1080,33 +1080,33 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) {
 ; RV32-NEXT:    .cfi_offset s5, -28
 ; RV32-NEXT:    .cfi_offset s6, -32
 ; RV32-NEXT:    mv s5, a5
-; RV32-NEXT:    mv s3, a1
-; RV32-NEXT:    andi a1, a5, 1
-; RV32-NEXT:    beqz a1, .LBB32_8
+; RV32-NEXT:    mv s3, a0
+; RV32-NEXT:    andi a0, a5, 1
+; RV32-NEXT:    beqz a0, .LBB32_8
 ; RV32-NEXT:  # %bb.1: # %t
 ; RV32-NEXT:    mv s0, a4
 ; RV32-NEXT:    mv s2, a3
 ; RV32-NEXT:    mv s1, a2
-; RV32-NEXT:    mv s4, a0
-; RV32-NEXT:    beq s3, a3, .LBB32_3
+; RV32-NEXT:    mv s4, a1
+; RV32-NEXT:    beq a1, a3, .LBB32_3
 ; RV32-NEXT:  # %bb.2: # %t
-; RV32-NEXT:    sltu s6, s3, s2
+; RV32-NEXT:    sltu s6, s4, s2
 ; RV32-NEXT:    j .LBB32_4
 ; RV32-NEXT:  .LBB32_3:
-; RV32-NEXT:    sltu s6, s4, s1
+; RV32-NEXT:    sltu s6, s3, s1
 ; RV32-NEXT:  .LBB32_4: # %t
 ; RV32-NEXT:    mv a0, s6
 ; RV32-NEXT:    call call
 ; RV32-NEXT:    beqz s6, .LBB32_8
 ; RV32-NEXT:  # %bb.5: # %end
-; RV32-NEXT:    sltu a1, s4, s1
+; RV32-NEXT:    sltu a1, s3, s1
 ; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:    beq s3, s2, .LBB32_7
+; RV32-NEXT:    beq s4, s2, .LBB32_7
 ; RV32-NEXT:  # %bb.6: # %end
-; RV32-NEXT:    sltu a0, s3, s2
+; RV32-NEXT:    sltu a0, s4, s2
 ; RV32-NEXT:  .LBB32_7: # %end
-; RV32-NEXT:    sub a2, s3, s2
-; RV32-NEXT:    sub a3, s4, s1
+; RV32-NEXT:    sub a2, s4, s2
+; RV32-NEXT:    sub a3, s3, s1
 ; RV32-NEXT:    sub a2, a2, a1
 ; RV32-NEXT:    sw a3, 0(s0)
 ; RV32-NEXT:    sw a2, 4(s0)
@@ -1151,13 +1151,13 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) {
 ; RV64-NEXT:    .cfi_offset s3, -40
 ; RV64-NEXT:    .cfi_offset s4, -48
 ; RV64-NEXT:    mv s0, a3
-; RV64-NEXT:    mv s2, a1
-; RV64-NEXT:    andi a1, a3, 1
-; RV64-NEXT:    beqz a1, .LBB32_3
+; RV64-NEXT:    mv s3, a0
+; RV64-NEXT:    andi a0, a3, 1
+; RV64-NEXT:    beqz a0, .LBB32_3
 ; RV64-NEXT:  # %bb.1: # %t
 ; RV64-NEXT:    mv s1, a2
-; RV64-NEXT:    mv s3, a0
-; RV64-NEXT:    sltu s4, a0, s2
+; RV64-NEXT:    mv s2, a1
+; RV64-NEXT:    sltu s4, s3, a1
 ; RV64-NEXT:    mv a0, s4
 ; RV64-NEXT:    call call
 ; RV64-NEXT:    bgeu s3, s2, .LBB32_3
diff --git a/llvm/test/CodeGen/RISCV/pr51206.ll b/llvm/test/CodeGen/RISCV/pr51206.ll
index 8e858bdd29762..ccb57c442fbfa 100644
--- a/llvm/test/CodeGen/RISCV/pr51206.ll
+++ b/llvm/test/CodeGen/RISCV/pr51206.ll
@@ -13,21 +13,21 @@ define signext i32 @wobble() nounwind {
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    lui a0, %hi(global)
 ; CHECK-NEXT:    lui a1, %hi(global.1)
-; CHECK-NEXT:    lbu a0, %lo(global)(a0)
 ; CHECK-NEXT:    lui a2, %hi(global.2)
-; CHECK-NEXT:    lui a3, 52429
-; CHECK-NEXT:    lbu a2, %lo(global.2)(a2)
+; CHECK-NEXT:    lbu a0, %lo(global)(a0)
 ; CHECK-NEXT:    addi a0, a0, 1
 ; CHECK-NEXT:    sw a0, %lo(global.1)(a1)
-; CHECK-NEXT:    lui a1, %hi(global.3)
-; CHECK-NEXT:    slli a3, a3, 4
+; CHECK-NEXT:    lui a1, 52429
+; CHECK-NEXT:    lbu a2, %lo(global.2)(a2)
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    mul a0, a0, a2
 ; CHECK-NEXT:    slli a2, a0, 48
-; CHECK-NEXT:    mulhu a2, a2, a3
-; CHECK-NEXT:    srli a2, a2, 18
-; CHECK-NEXT:    li a3, 5
-; CHECK-NEXT:    sw a2, %lo(global.3)(a1)
-; CHECK-NEXT:    bgeu a0, a3, .LBB0_2
+; CHECK-NEXT:    mulhu a1, a2, a1
+; CHECK-NEXT:    lui a2, %hi(global.3)
+; CHECK-NEXT:    srli a1, a1, 18
+; CHECK-NEXT:    sw a1, %lo(global.3)(a2)
+; CHECK-NEXT:    li a1, 5
+; CHECK-NEXT:    bgeu a0, a1, .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %bb12
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/pr58511.ll b/llvm/test/CodeGen/RISCV/pr58511.ll
index e5cba679729fa..c06a5b1cf11fa 100644
--- a/llvm/test/CodeGen/RISCV/pr58511.ll
+++ b/llvm/test/CodeGen/RISCV/pr58511.ll
@@ -47,8 +47,8 @@ define i32 @h(i1 %0, i32 %1, ptr %2) {
 ; CHECK-NEXT:    addiw a3, a3, -2047
 ; CHECK-NEXT:    srai a0, a0, 63
 ; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    sw a1, 0(a2)
+; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    ret
 BB:
   %I = select i1 %0, i32 -1, i32 0
@@ -66,8 +66,8 @@ define i32 @i(i1 %0, i32 %1, ptr %2) {
 ; CHECK-NEXT:    addiw a3, a3, -2047
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    sw a1, 0(a2)
+; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    ret
 BB:
   %I = select i1 %0, i32 0, i32 -1
diff --git a/llvm/test/CodeGen/RISCV/pr63816.ll b/llvm/test/CodeGen/RISCV/pr63816.ll
index 75ddeda3de507..5632e8ec16224 100644
--- a/llvm/test/CodeGen/RISCV/pr63816.ll
+++ b/llvm/test/CodeGen/RISCV/pr63816.ll
@@ -47,12 +47,12 @@ define void @test(ptr %0, ptr %1) nounwind {
 ; CHECK-NEXT:    fcvt.d.s fs6, fa0
 ; CHECK-NEXT:    fcvt.d.s fs5, fs5
 ; CHECK-NEXT:    fcvt.d.s fs4, fs4
-; CHECK-NEXT:    lhu a0, 14(s1)
 ; CHECK-NEXT:    fcvt.d.s fs3, fs3
 ; CHECK-NEXT:    fcvt.d.s fs2, fs2
 ; CHECK-NEXT:    fcvt.d.s fs1, fs1
-; CHECK-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NEXT:    fcvt.d.s fs0, fs0
+; CHECK-NEXT:    lhu a0, 14(s1)
+; CHECK-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    fcvt.d.s fa5, fa0
 ; CHECK-NEXT:    fsd fs2, 32(s0)
diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll
index 9fc9a3c42867e..55c198aeb98b0 100644
--- a/llvm/test/CodeGen/RISCV/pr69586.ll
+++ b/llvm/test/CodeGen/RISCV/pr69586.ll
@@ -7,21 +7,21 @@
 define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-LABEL: test:
 ; NOREMAT:       # %bb.0:
-; NOREMAT-NEXT:    addi sp, sp, -752
-; NOREMAT-NEXT:    .cfi_def_cfa_offset 752
-; NOREMAT-NEXT:    sd ra, 744(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s0, 736(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s1, 728(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s2, 720(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s3, 712(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s4, 704(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s5, 696(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s6, 688(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s7, 680(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s8, 672(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s9, 664(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s10, 656(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s11, 648(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addi sp, sp, -720
+; NOREMAT-NEXT:    .cfi_def_cfa_offset 720
+; NOREMAT-NEXT:    sd ra, 712(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s0, 704(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s1, 696(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s2, 688(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s3, 680(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s4, 672(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s5, 664(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s6, 656(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s7, 648(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s8, 640(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s9, 632(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s10, 624(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s11, 616(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    .cfi_offset ra, -8
 ; NOREMAT-NEXT:    .cfi_offset s0, -16
 ; NOREMAT-NEXT:    .cfi_offset s1, -24
@@ -35,608 +35,597 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    .cfi_offset s9, -88
 ; NOREMAT-NEXT:    .cfi_offset s10, -96
 ; NOREMAT-NEXT:    .cfi_offset s11, -104
-; NOREMAT-NEXT:    csrr a2, vlenb
-; NOREMAT-NEXT:    slli a2, a2, 1
-; NOREMAT-NEXT:    sub sp, sp, a2
-; NOREMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb
 ; NOREMAT-NEXT:    mv a7, a0
 ; NOREMAT-NEXT:    li a0, 32
-; NOREMAT-NEXT:    addi a5, a7, 512
+; NOREMAT-NEXT:    addi a6, a7, 512
 ; NOREMAT-NEXT:    addi a4, a7, 1024
-; NOREMAT-NEXT:    addi a6, a7, 1536
-; NOREMAT-NEXT:    li t4, 1
-; NOREMAT-NEXT:    li a2, 5
+; NOREMAT-NEXT:    addi a5, a7, 1536
+; NOREMAT-NEXT:    li t0, 1
+; NOREMAT-NEXT:    li a3, 5
 ; NOREMAT-NEXT:    li t1, 3
-; NOREMAT-NEXT:    li t0, 7
-; NOREMAT-NEXT:    lui t5, 1
-; NOREMAT-NEXT:    li s4, 9
-; NOREMAT-NEXT:    li s6, 11
-; NOREMAT-NEXT:    li s9, 13
-; NOREMAT-NEXT:    li ra, 15
-; NOREMAT-NEXT:    lui t2, 2
-; NOREMAT-NEXT:    lui s1, 3
-; NOREMAT-NEXT:    lui t3, 4
-; NOREMAT-NEXT:    lui s0, 5
-; NOREMAT-NEXT:    lui s3, 6
-; NOREMAT-NEXT:    lui s7, 7
-; NOREMAT-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; NOREMAT-NEXT:    slli t4, t4, 11
-; NOREMAT-NEXT:    sd t4, 512(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    slli a3, a2, 9
-; NOREMAT-NEXT:    sd a3, 504(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    slli t6, t1, 10
-; NOREMAT-NEXT:    slli s2, t0, 9
-; NOREMAT-NEXT:    add a0, a7, t5
-; NOREMAT-NEXT:    lui s11, 1
-; NOREMAT-NEXT:    slli s4, s4, 9
-; NOREMAT-NEXT:    slli s5, a2, 10
-; NOREMAT-NEXT:    slli s6, s6, 9
-; NOREMAT-NEXT:    slli s8, t1, 11
-; NOREMAT-NEXT:    vle32.v v8, (a5)
-; NOREMAT-NEXT:    slli s9, s9, 9
+; NOREMAT-NEXT:    li a2, 7
+; NOREMAT-NEXT:    lui t4, 1
+; NOREMAT-NEXT:    li s8, 9
+; NOREMAT-NEXT:    li s10, 11
 ; NOREMAT-NEXT:    li t5, 13
-; NOREMAT-NEXT:    vle32.v v10, (a4)
-; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    slli s10, t0, 10
-; NOREMAT-NEXT:    vle32.v v0, (a6)
-; NOREMAT-NEXT:    vle32.v v12, (a6)
-; NOREMAT-NEXT:    slli ra, ra, 9
+; NOREMAT-NEXT:    lui s1, 2
+; NOREMAT-NEXT:    lui t3, 3
+; NOREMAT-NEXT:    lui s3, 4
+; NOREMAT-NEXT:    lui s11, 5
+; NOREMAT-NEXT:    lui t2, 6
+; NOREMAT-NEXT:    lui t6, 7
+; NOREMAT-NEXT:    lui s5, 8
+; NOREMAT-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; NOREMAT-NEXT:    slli t0, t0, 11
+; NOREMAT-NEXT:    slli s0, a3, 9
+; NOREMAT-NEXT:    slli s4, t1, 10
+; NOREMAT-NEXT:    slli s6, a2, 9
+; NOREMAT-NEXT:    add a0, a7, t4
+; NOREMAT-NEXT:    slli s8, s8, 9
+; NOREMAT-NEXT:    slli s9, a3, 10
+; NOREMAT-NEXT:    vle32.v v10, (a6)
+; NOREMAT-NEXT:    slli s10, s10, 9
+; NOREMAT-NEXT:    slli ra, t1, 11
+; NOREMAT-NEXT:    vle32.v v14, (a4)
+; NOREMAT-NEXT:    vle32.v v8, (a4)
+; NOREMAT-NEXT:    slli t5, t5, 9
+; NOREMAT-NEXT:    vle32.v v0, (a5)
+; NOREMAT-NEXT:    vle32.v v12, (a5)
+; NOREMAT-NEXT:    add a4, a7, s1
+; NOREMAT-NEXT:    vle32.v v2, (a0)
 ; NOREMAT-NEXT:    vle32.v v4, (a0)
-; NOREMAT-NEXT:    vle32.v v20, (a0)
-; NOREMAT-NEXT:    add a4, a7, t2
+; NOREMAT-NEXT:    add a5, a7, t3
 ; NOREMAT-NEXT:    vle32.v v6, (a4)
 ; NOREMAT-NEXT:    vle32.v v30, (a4)
-; NOREMAT-NEXT:    add a4, a7, s1
-; NOREMAT-NEXT:    vle32.v v28, (a4)
-; NOREMAT-NEXT:    vle32.v v26, (a4)
-; NOREMAT-NEXT:    add a4, a7, t3
+; NOREMAT-NEXT:    add a4, a7, s3
+; NOREMAT-NEXT:    vle32.v v28, (a5)
+; NOREMAT-NEXT:    vle32.v v26, (a5)
+; NOREMAT-NEXT:    add a5, a7, s11
 ; NOREMAT-NEXT:    vle32.v v24, (a4)
 ; NOREMAT-NEXT:    vle32.v v22, (a4)
-; NOREMAT-NEXT:    add a4, a7, s0
-; NOREMAT-NEXT:    vle32.v v14, (a7)
-; NOREMAT-NEXT:    vle32.v v18, (a4)
+; NOREMAT-NEXT:    add a4, a7, t2
+; NOREMAT-NEXT:    vle32.v v20, (a5)
+; NOREMAT-NEXT:    vle32.v v18, (a5)
+; NOREMAT-NEXT:    add a5, a7, t6
+; NOREMAT-NEXT:    vle32.v v16, (a7)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v10
 ; NOREMAT-NEXT:    vle32.v v16, (a4)
-; NOREMAT-NEXT:    add a4, a7, s3
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v8
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v14
 ; NOREMAT-NEXT:    vle32.v v14, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
-; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    addi a0, sp, 640
-; NOREMAT-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; NOREMAT-NEXT:    add a4, a7, t4
-; NOREMAT-NEXT:    vle32.v v10, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v0
-; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    add a4, a7, a3
-; NOREMAT-NEXT:    vle32.v v0, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v10
-; NOREMAT-NEXT:    vle32.v v10, (a4)
-; NOREMAT-NEXT:    add a4, a7, t6
-; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v0
-; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    add a4, a7, s2
-; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    add a4, a7, s7
-; NOREMAT-NEXT:    vle32.v v0, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v8
-; NOREMAT-NEXT:    vle32.v v10, (a4)
-; NOREMAT-NEXT:    add a4, a7, s4
-; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
-; NOREMAT-NEXT:    vle32.v v12, (a4)
 ; NOREMAT-NEXT:    add a4, a7, s5
-; NOREMAT-NEXT:    vle32.v v4, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v8
-; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    add a4, a7, s6
-; NOREMAT-NEXT:    vle32.v v20, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
+; NOREMAT-NEXT:    vle32.v v10, (a5)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v0
+; NOREMAT-NEXT:    vle32.v v8, (a5)
+; NOREMAT-NEXT:    add a5, a7, t0
+; NOREMAT-NEXT:    mv t3, t0
+; NOREMAT-NEXT:    vle32.v v0, (a5)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v0
+; NOREMAT-NEXT:    vle32.v v12, (a5)
+; NOREMAT-NEXT:    add a5, a7, s0
+; NOREMAT-NEXT:    vle32.v v0, (a5)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v0
+; NOREMAT-NEXT:    vle32.v v12, (a5)
+; NOREMAT-NEXT:    add a5, a7, s4
+; NOREMAT-NEXT:    vle32.v v0, (a5)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v0
+; NOREMAT-NEXT:    vle32.v v12, (a5)
+; NOREMAT-NEXT:    add a5, a7, s6
+; NOREMAT-NEXT:    vle32.v v0, (a5)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v0
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
+; NOREMAT-NEXT:    slli a6, a2, 10
+; NOREMAT-NEXT:    sd a6, 608(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    vle32.v v0, (a5)
 ; NOREMAT-NEXT:    add a4, a7, s8
-; NOREMAT-NEXT:    vle32.v v4, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v20
-; NOREMAT-NEXT:    vle32.v v8, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v2
+; NOREMAT-NEXT:    vle32.v v2, (a4)
+; NOREMAT-NEXT:    vle32.v v0, (a4)
 ; NOREMAT-NEXT:    add a4, a7, s9
-; NOREMAT-NEXT:    vle32.v v20, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
-; NOREMAT-NEXT:    vle32.v v12, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v2
+; NOREMAT-NEXT:    vle32.v v4, (a4)
+; NOREMAT-NEXT:    vle32.v v2, (a4)
 ; NOREMAT-NEXT:    add a4, a7, s10
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v4
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v20
-; NOREMAT-NEXT:    vle32.v v8, (a4)
+; NOREMAT-NEXT:    vle32.v v0, (a4)
 ; NOREMAT-NEXT:    add a4, a7, ra
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v4
+; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
-; NOREMAT-NEXT:    lui t4, 8
-; NOREMAT-NEXT:    add a5, a7, t4
-; NOREMAT-NEXT:    vle32.v v20, (a5)
-; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v2
+; NOREMAT-NEXT:    add a4, a7, t5
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v4
+; NOREMAT-NEXT:    vle32.v v4, (a4)
+; NOREMAT-NEXT:    vle32.v v0, (a4)
+; NOREMAT-NEXT:    add a4, a7, a6
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v4
+; NOREMAT-NEXT:    vle32.v v4, (a4)
+; NOREMAT-NEXT:    vle32.v v2, (a4)
+; NOREMAT-NEXT:    li a5, 15
+; NOREMAT-NEXT:    slli a4, a5, 9
+; NOREMAT-NEXT:    sd a4, 600(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v4
+; NOREMAT-NEXT:    add a4, a7, a4
+; NOREMAT-NEXT:    vle32.v v4, (a4)
+; NOREMAT-NEXT:    vle32.v v0, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v4
 ; NOREMAT-NEXT:    li a4, 17
 ; NOREMAT-NEXT:    slli a4, a4, 9
-; NOREMAT-NEXT:    li s1, 17
-; NOREMAT-NEXT:    sd a4, 624(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    li t4, 17
+; NOREMAT-NEXT:    sd a4, 592(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a4, a7, a4
-; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v6
-; NOREMAT-NEXT:    li a5, 9
-; NOREMAT-NEXT:    slli a4, a5, 10
-; NOREMAT-NEXT:    sd a4, 616(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    vle32.v v2, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v6
+; NOREMAT-NEXT:    li t1, 9
+; NOREMAT-NEXT:    slli a4, t1, 10
+; NOREMAT-NEXT:    sd a4, 584(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a4, a7, a4
-; NOREMAT-NEXT:    vle32.v v12, (a4)
 ; NOREMAT-NEXT:    vle32.v v6, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
+; NOREMAT-NEXT:    vle32.v v0, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v4
 ; NOREMAT-NEXT:    li a4, 19
 ; NOREMAT-NEXT:    slli a4, a4, 9
-; NOREMAT-NEXT:    li t2, 19
-; NOREMAT-NEXT:    sd a4, 608(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    li s1, 19
+; NOREMAT-NEXT:    sd a4, 576(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a4, a7, a4
-; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    vle32.v v30, (a4)
-; NOREMAT-NEXT:    slli a3, a2, 11
-; NOREMAT-NEXT:    sd a3, 600(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v12
+; NOREMAT-NEXT:    vle32.v v4, (a4)
+; NOREMAT-NEXT:    slli a3, a3, 11
+; NOREMAT-NEXT:    sd a3, 568(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v6
 ; NOREMAT-NEXT:    add a3, a7, a3
-; NOREMAT-NEXT:    vle32.v v12, (a3)
-; NOREMAT-NEXT:    vle32.v v4, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
+; NOREMAT-NEXT:    vle32.v v6, (a3)
+; NOREMAT-NEXT:    vle32.v v2, (a3)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v30
 ; NOREMAT-NEXT:    li s7, 21
 ; NOREMAT-NEXT:    slli a3, s7, 9
-; NOREMAT-NEXT:    sd a3, 592(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a7, a3
-; NOREMAT-NEXT:    vle32.v v8, (a3)
-; NOREMAT-NEXT:    vle32.v v6, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT:    li a6, 11
-; NOREMAT-NEXT:    slli a3, a6, 10
-; NOREMAT-NEXT:    sd a3, 584(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a3, 560(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a3, a7, a3
-; NOREMAT-NEXT:    vle32.v v12, (a3)
 ; NOREMAT-NEXT:    vle32.v v30, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v8
-; NOREMAT-NEXT:    li s3, 23
-; NOREMAT-NEXT:    slli a3, s3, 9
-; NOREMAT-NEXT:    sd a3, 576(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    vle32.v v0, (a3)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v6
+; NOREMAT-NEXT:    li a4, 11
+; NOREMAT-NEXT:    slli a3, a4, 10
+; NOREMAT-NEXT:    sd a3, 552(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a3, a7, a3
-; NOREMAT-NEXT:    vle32.v v8, (a3)
+; NOREMAT-NEXT:    vle32.v v6, (a3)
 ; NOREMAT-NEXT:    vle32.v v4, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v12
-; NOREMAT-NEXT:    li s0, 25
-; NOREMAT-NEXT:    slli a3, s0, 9
-; NOREMAT-NEXT:    sd a3, 568(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v30
+; NOREMAT-NEXT:    li s2, 23
+; NOREMAT-NEXT:    slli a3, s2, 9
+; NOREMAT-NEXT:    sd a3, 544(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    vle32.v v30, (a3)
+; NOREMAT-NEXT:    vle32.v v2, (a3)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v6
+; NOREMAT-NEXT:    li t6, 25
+; NOREMAT-NEXT:    slli a3, t6, 9
+; NOREMAT-NEXT:    sd a3, 536(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a3, a7, a3
-; NOREMAT-NEXT:    vle32.v v12, (a3)
 ; NOREMAT-NEXT:    vle32.v v6, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
-; NOREMAT-NEXT:    slli a3, t5, 10
-; NOREMAT-NEXT:    sd a3, 560(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    vle32.v v0, (a3)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v30
+; NOREMAT-NEXT:    li a6, 13
+; NOREMAT-NEXT:    slli a3, a6, 10
+; NOREMAT-NEXT:    sd a3, 528(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a3, a7, a3
-; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v30, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v28
-; NOREMAT-NEXT:    li t3, 27
-; NOREMAT-NEXT:    slli a3, t3, 9
-; NOREMAT-NEXT:    sd a3, 552(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    vle32.v v4, (a3)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v28
+; NOREMAT-NEXT:    li t2, 27
+; NOREMAT-NEXT:    slli a3, t2, 9
+; NOREMAT-NEXT:    sd a3, 520(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v28, (a3)
-; NOREMAT-NEXT:    vle32.v v4, (a3)
-; NOREMAT-NEXT:    slli a2, t0, 11
-; NOREMAT-NEXT:    sd a2, 544(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v12
+; NOREMAT-NEXT:    vle32.v v2, (a3)
+; NOREMAT-NEXT:    slli a2, a2, 11
+; NOREMAT-NEXT:    sd a2, 512(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v6
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
-; NOREMAT-NEXT:    li t0, 29
-; NOREMAT-NEXT:    slli a2, t0, 9
-; NOREMAT-NEXT:    sd a2, 536(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v28
-; NOREMAT-NEXT:    li a3, 15
-; NOREMAT-NEXT:    slli a2, a3, 10
-; NOREMAT-NEXT:    sd a2, 528(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v30
+; NOREMAT-NEXT:    li a3, 29
+; NOREMAT-NEXT:    slli a2, a3, 9
+; NOREMAT-NEXT:    sd a2, 504(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v12
-; NOREMAT-NEXT:    li t1, 31
-; NOREMAT-NEXT:    slli a2, t1, 9
-; NOREMAT-NEXT:    sd a2, 520(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    vle32.v v0, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v28
+; NOREMAT-NEXT:    slli a2, a5, 10
+; NOREMAT-NEXT:    sd a2, 496(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    li t0, 15
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v12, (a2)
+; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v8
-; NOREMAT-NEXT:    lui a4, 4
-; NOREMAT-NEXT:    addiw a0, a4, 512
-; NOREMAT-NEXT:    sd a0, 496(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v26
+; NOREMAT-NEXT:    li a5, 31
+; NOREMAT-NEXT:    slli a0, a5, 9
+; NOREMAT-NEXT:    sd a0, 488(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a0, a7, a0
-; NOREMAT-NEXT:    vle32.v v8, (a0)
 ; NOREMAT-NEXT:    vle32.v v26, (a0)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v28
-; NOREMAT-NEXT:    slli a2, s1, 10
-; NOREMAT-NEXT:    sd a2, 488(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v28, (a2)
-; NOREMAT-NEXT:    vle32.v v6, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT:    addiw a2, a4, 1536
+; NOREMAT-NEXT:    vle32.v v2, (a0)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v30
+; NOREMAT-NEXT:    addiw a2, s3, 512
 ; NOREMAT-NEXT:    sd a2, 480(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
-; NOREMAT-NEXT:    slli a2, a5, 11
+; NOREMAT-NEXT:    vle32.v v6, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v28
+; NOREMAT-NEXT:    slli a2, t4, 10
 ; NOREMAT-NEXT:    sd a2, 472(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v24
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v24, (a2)
-; NOREMAT-NEXT:    vle32.v v4, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v8
-; NOREMAT-NEXT:    lui a5, 5
-; NOREMAT-NEXT:    addiw a2, a5, -1536
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    vle32.v v0, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v26
+; NOREMAT-NEXT:    addiw a2, s3, 1536
 ; NOREMAT-NEXT:    sd a2, 464(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v8, (a2)
-; NOREMAT-NEXT:    vle32.v v22, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v28
-; NOREMAT-NEXT:    slli a2, t2, 10
+; NOREMAT-NEXT:    vle32.v v26, (a2)
+; NOREMAT-NEXT:    vle32.v v4, (a2)
+; NOREMAT-NEXT:    slli a2, t1, 11
 ; NOREMAT-NEXT:    sd a2, 456(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    li t2, 19
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v24
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v26, (a2)
-; NOREMAT-NEXT:    vle32.v v28, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v12
-; NOREMAT-NEXT:    addiw a2, a5, -512
+; NOREMAT-NEXT:    vle32.v v24, (a2)
+; NOREMAT-NEXT:    vle32.v v2, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v30
+; NOREMAT-NEXT:    addiw a2, s11, -1536
 ; NOREMAT-NEXT:    sd a2, 448(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v12, (a2)
-; NOREMAT-NEXT:    vle32.v v6, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v24
-; NOREMAT-NEXT:    addiw a2, a5, 512
+; NOREMAT-NEXT:    vle32.v v22, (a2)
+; NOREMAT-NEXT:    vle32.v v30, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v28
+; NOREMAT-NEXT:    slli a2, s1, 10
 ; NOREMAT-NEXT:    sd a2, 440(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v24, (a2)
-; NOREMAT-NEXT:    vle32.v v30, (a2)
-; NOREMAT-NEXT:    slli a2, s7, 10
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    vle32.v v6, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v26
+; NOREMAT-NEXT:    addiw a2, s11, -512
 ; NOREMAT-NEXT:    sd a2, 432(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v8
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v8, (a2)
-; NOREMAT-NEXT:    vle32.v v4, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v26
-; NOREMAT-NEXT:    addiw a2, a5, 1536
+; NOREMAT-NEXT:    vle32.v v26, (a2)
+; NOREMAT-NEXT:    vle32.v v0, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v24
+; NOREMAT-NEXT:    addiw a2, s11, 512
 ; NOREMAT-NEXT:    sd a2, 424(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v22, (a2)
-; NOREMAT-NEXT:    vle32.v v26, (a2)
-; NOREMAT-NEXT:    slli a2, a6, 11
+; NOREMAT-NEXT:    vle32.v v24, (a2)
+; NOREMAT-NEXT:    vle32.v v4, (a2)
+; NOREMAT-NEXT:    slli a2, s7, 10
 ; NOREMAT-NEXT:    sd a2, 416(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v12
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v22
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v12, (a2)
-; NOREMAT-NEXT:    vle32.v v28, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v18
-; NOREMAT-NEXT:    lui a6, 6
-; NOREMAT-NEXT:    addiw a2, a6, -1536
+; NOREMAT-NEXT:    vle32.v v22, (a2)
+; NOREMAT-NEXT:    vle32.v v2, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v28
+; NOREMAT-NEXT:    addiw a2, s11, 1536
 ; NOREMAT-NEXT:    sd a2, 408(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v18, (a2)
-; NOREMAT-NEXT:    vle32.v v6, (a2)
-; NOREMAT-NEXT:    slli a2, s3, 10
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    vle32.v v30, (a2)
+; NOREMAT-NEXT:    slli a2, a4, 11
 ; NOREMAT-NEXT:    sd a2, 400(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v24
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v26
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v16, (a2)
-; NOREMAT-NEXT:    vle32.v v24, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
-; NOREMAT-NEXT:    addiw a2, a6, -512
+; NOREMAT-NEXT:    vle32.v v26, (a2)
+; NOREMAT-NEXT:    vle32.v v6, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v20
+; NOREMAT-NEXT:    lui a4, 6
+; NOREMAT-NEXT:    addiw a2, a4, -1536
 ; NOREMAT-NEXT:    sd a2, 392(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v8, (a2)
-; NOREMAT-NEXT:    vle32.v v30, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v22
-; NOREMAT-NEXT:    addiw a2, a6, 512
+; NOREMAT-NEXT:    vle32.v v20, (a2)
+; NOREMAT-NEXT:    vle32.v v0, (a2)
+; NOREMAT-NEXT:    slli a2, s2, 10
 ; NOREMAT-NEXT:    sd a2, 384(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v18, v24
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v22, (a2)
-; NOREMAT-NEXT:    vle32.v v4, (a2)
-; NOREMAT-NEXT:    slli a2, s0, 10
+; NOREMAT-NEXT:    vle32.v v18, (a2)
+; NOREMAT-NEXT:    vle32.v v24, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v22
+; NOREMAT-NEXT:    addiw a2, a4, -512
 ; NOREMAT-NEXT:    sd a2, 376(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v12
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v26, (a2)
-; NOREMAT-NEXT:    vle32.v v2, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v18
-; NOREMAT-NEXT:    addiw a2, a6, 1536
+; NOREMAT-NEXT:    vle32.v v22, (a2)
+; NOREMAT-NEXT:    vle32.v v4, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v28
+; NOREMAT-NEXT:    addiw a2, a4, 512
 ; NOREMAT-NEXT:    sd a2, 368(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v18, (a2)
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
-; NOREMAT-NEXT:    slli a2, t5, 11
+; NOREMAT-NEXT:    vle32.v v2, (a2)
+; NOREMAT-NEXT:    slli a2, t6, 10
 ; NOREMAT-NEXT:    sd a2, 360(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v16
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v26
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v16, (a2)
-; NOREMAT-NEXT:    vle32.v v6, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v8
-; NOREMAT-NEXT:    lui s0, 7
-; NOREMAT-NEXT:    addiw a2, s0, -1536
+; NOREMAT-NEXT:    vle32.v v26, (a2)
+; NOREMAT-NEXT:    vle32.v v30, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v20
+; NOREMAT-NEXT:    addiw a2, a4, 1536
 ; NOREMAT-NEXT:    sd a2, 352(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v8, (a2)
-; NOREMAT-NEXT:    vle32.v v24, (a2)
-; NOREMAT-NEXT:    slli a2, t3, 10
+; NOREMAT-NEXT:    vle32.v v20, (a2)
+; NOREMAT-NEXT:    vle32.v v6, (a2)
+; NOREMAT-NEXT:    slli a2, a6, 11
 ; NOREMAT-NEXT:    sd a2, 344(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v14
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v18
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v14, (a2)
-; NOREMAT-NEXT:    vle32.v v30, (a2)
-; NOREMAT-NEXT:    addi a0, sp, 640
-; NOREMAT-NEXT:    vl2r.v v12, (a0) # Unknown-size Folded Reload
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v22
-; NOREMAT-NEXT:    addiw a2, s0, -512
+; NOREMAT-NEXT:    vle32.v v18, (a2)
+; NOREMAT-NEXT:    vle32.v v0, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v22
+; NOREMAT-NEXT:    lui a6, 7
+; NOREMAT-NEXT:    addiw a2, a6, -1536
 ; NOREMAT-NEXT:    sd a2, 336(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
-; NOREMAT-NEXT:    vle32.v v12, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v26
-; NOREMAT-NEXT:    addiw a2, s0, 512
+; NOREMAT-NEXT:    vle32.v v24, (a2)
+; NOREMAT-NEXT:    slli a2, t2, 10
 ; NOREMAT-NEXT:    sd a2, 328(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    lui t3, 7
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v16
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v26, (a2)
+; NOREMAT-NEXT:    vle32.v v16, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
-; NOREMAT-NEXT:    slli a2, t0, 10
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v28
+; NOREMAT-NEXT:    addiw a2, a6, -512
 ; NOREMAT-NEXT:    sd a2, 320(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v18
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v18, (a2)
-; NOREMAT-NEXT:    vle32.v v2, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v16
-; NOREMAT-NEXT:    addiw a2, t3, 1536
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    vle32.v v14, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v26
+; NOREMAT-NEXT:    addiw a2, a6, 512
 ; NOREMAT-NEXT:    sd a2, 312(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v16, (a2)
-; NOREMAT-NEXT:    vle32.v v28, (a2)
-; NOREMAT-NEXT:    slli a2, a3, 11
+; NOREMAT-NEXT:    vle32.v v26, (a2)
+; NOREMAT-NEXT:    vle32.v v2, (a2)
+; NOREMAT-NEXT:    slli a2, a3, 10
 ; NOREMAT-NEXT:    sd a2, 304(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v20
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v8, (a2)
-; NOREMAT-NEXT:    vle32.v v6, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v14
-; NOREMAT-NEXT:    addiw a2, t4, -1536
+; NOREMAT-NEXT:    vle32.v v20, (a2)
+; NOREMAT-NEXT:    vle32.v v30, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v18
+; NOREMAT-NEXT:    addiw a2, a6, 1536
 ; NOREMAT-NEXT:    sd a2, 296(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a7, a2
-; NOREMAT-NEXT:    vle32.v v14, (a2)
-; NOREMAT-NEXT:    vle32.v v24, (a2)
-; NOREMAT-NEXT:    slli a2, t1, 10
+; NOREMAT-NEXT:    vle32.v v18, (a2)
+; NOREMAT-NEXT:    vle32.v v6, (a2)
+; NOREMAT-NEXT:    slli a2, t0, 11
 ; NOREMAT-NEXT:    sd a2, 288(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v22
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v22
 ; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
-; NOREMAT-NEXT:    vle32.v v30, (a2)
-; NOREMAT-NEXT:    addiw a0, t4, -512
-; NOREMAT-NEXT:    sd a0, 280(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    vle32.v v0, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v16
+; NOREMAT-NEXT:    addiw a2, s5, -1536
+; NOREMAT-NEXT:    sd a2, 280(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v16, (a2)
+; NOREMAT-NEXT:    vle32.v v24, (a2)
+; NOREMAT-NEXT:    slli a2, a5, 10
+; NOREMAT-NEXT:    sd a2, 272(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v28
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    vle32.v v4, (a2)
+; NOREMAT-NEXT:    addiw a0, s5, -512
+; NOREMAT-NEXT:    sd a0, 264(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a0, a7, a0
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v0
-; NOREMAT-NEXT:    vle32.v v12, (a0)
-; NOREMAT-NEXT:    vle32.v v0, (a0)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v26
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v18
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v16
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v8
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v14
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v22
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v20
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v10
+; NOREMAT-NEXT:    vle32.v v10, (a0)
+; NOREMAT-NEXT:    vle32.v v14, (a0)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v20
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v18
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v22
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v16
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v28
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v10
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v12
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    addi a0, a1, 1024
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
-; NOREMAT-NEXT:    add s11, a1, s11
-; NOREMAT-NEXT:    sd s11, 272(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    lui a0, 1
+; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    sd a0, 256(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    lui a0, 2
 ; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    sd a0, 264(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a0, 248(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    lui a0, 3
 ; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    sd a0, 256(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a0, 240(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add s3, a1, s3
+; NOREMAT-NEXT:    sd s3, 232(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add s11, a1, s11
+; NOREMAT-NEXT:    sd s11, 224(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a4, a1, a4
-; NOREMAT-NEXT:    sd a4, 248(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a5, a1, a5
-; NOREMAT-NEXT:    sd a5, 240(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a4, 216(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a6, a1, a6
-; NOREMAT-NEXT:    sd a6, 232(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add t3, a1, t3
-; NOREMAT-NEXT:    sd t3, 224(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a0, a1, t4
-; NOREMAT-NEXT:    sd a0, 216(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    addiw a0, t4, 512
-; NOREMAT-NEXT:    sd a0, 192(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    addiw a0, t4, 1024
+; NOREMAT-NEXT:    sd a6, 208(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a0, a1, s5
+; NOREMAT-NEXT:    sd a0, 200(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw a0, s5, 512
 ; NOREMAT-NEXT:    sd a0, 176(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    addiw a0, t4, 1536
+; NOREMAT-NEXT:    addiw a0, s5, 1024
 ; NOREMAT-NEXT:    sd a0, 160(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    slli s1, s1, 11
-; NOREMAT-NEXT:    sd s1, 128(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw a0, s5, 1536
+; NOREMAT-NEXT:    sd a0, 144(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    slli t4, t4, 11
+; NOREMAT-NEXT:    sd t4, 112(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    lui a0, 9
 ; NOREMAT-NEXT:    addiw a2, a0, -1536
-; NOREMAT-NEXT:    sd a2, 88(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    addiw a2, a0, -1024
 ; NOREMAT-NEXT:    sd a2, 72(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw a2, a0, -1024
+; NOREMAT-NEXT:    sd a2, 56(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    addiw a2, a0, -512
-; NOREMAT-NEXT:    sd a2, 40(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a2, 24(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add a2, a1, a0
-; NOREMAT-NEXT:    sd a2, 208(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a2, 192(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    addiw s11, a0, 512
 ; NOREMAT-NEXT:    addiw s7, a0, 1024
 ; NOREMAT-NEXT:    addiw s3, a0, 1536
-; NOREMAT-NEXT:    slli s1, t2, 11
+; NOREMAT-NEXT:    slli s1, s1, 11
 ; NOREMAT-NEXT:    lui a0, 10
 ; NOREMAT-NEXT:    addiw t2, a0, -1536
 ; NOREMAT-NEXT:    addiw a7, a0, -1024
 ; NOREMAT-NEXT:    addiw a4, a0, -512
 ; NOREMAT-NEXT:    add a2, a1, a0
-; NOREMAT-NEXT:    sd a2, 200(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd a2, 184(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    addiw a0, a0, 512
-; NOREMAT-NEXT:    ld a2, 512(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a2, a1, a2
-; NOREMAT-NEXT:    ld a3, 504(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a3, a1, a3
-; NOREMAT-NEXT:    add a5, a1, t6
-; NOREMAT-NEXT:    add a6, a1, s2
-; NOREMAT-NEXT:    add t0, a1, s4
-; NOREMAT-NEXT:    add t1, a1, s5
-; NOREMAT-NEXT:    add t3, a1, s6
-; NOREMAT-NEXT:    add t4, a1, s8
-; NOREMAT-NEXT:    add t5, a1, s9
-; NOREMAT-NEXT:    add t6, a1, s10
-; NOREMAT-NEXT:    add s0, a1, ra
-; NOREMAT-NEXT:    ld s2, 624(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add a2, a1, t3
+; NOREMAT-NEXT:    add a3, a1, s0
+; NOREMAT-NEXT:    add a5, a1, s4
+; NOREMAT-NEXT:    add a6, a1, s6
+; NOREMAT-NEXT:    add t0, a1, s8
+; NOREMAT-NEXT:    add t1, a1, s9
+; NOREMAT-NEXT:    add t3, a1, s10
+; NOREMAT-NEXT:    add t4, a1, ra
+; NOREMAT-NEXT:    add t5, a1, t5
+; NOREMAT-NEXT:    ld t6, 608(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add t6, a1, t6
+; NOREMAT-NEXT:    ld s0, 600(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add s0, a1, s0
+; NOREMAT-NEXT:    ld s2, 592(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s2, a1, s2
-; NOREMAT-NEXT:    ld s4, 616(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s4, 584(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s4, a1, s4
-; NOREMAT-NEXT:    ld s5, 608(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s5, 576(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s5, a1, s5
-; NOREMAT-NEXT:    ld s6, 600(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s6, 568(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s6, a1, s6
-; NOREMAT-NEXT:    ld s8, 592(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s8, 560(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s8, a1, s8
-; NOREMAT-NEXT:    ld s9, 584(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s9, 552(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s9, a1, s9
-; NOREMAT-NEXT:    ld s10, 576(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s10, 544(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add s10, a1, s10
-; NOREMAT-NEXT:    ld ra, 568(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 16(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 560(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 552(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 32(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 544(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 48(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 536(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 0(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 528(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 64(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 520(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 80(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 16(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 512(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 32(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 504(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 496(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 96(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 48(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 488(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 104(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 64(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 480(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 112(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 80(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 472(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 464(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 96(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 456(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 144(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 104(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 448(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 152(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 440(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 168(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 128(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 432(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 184(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 424(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 424(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 152(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 416(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 432(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 168(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 408(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 440(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 408(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 400(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 448(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 416(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 392(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 456(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 424(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 384(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 464(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 432(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 472(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 440(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 368(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 480(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 448(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 360(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 488(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 456(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 352(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 496(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 464(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 344(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 504(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 472(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 336(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 512(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 480(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 328(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 520(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 488(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 320(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 528(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 496(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 312(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 536(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 504(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 304(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 544(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 512(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 296(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 552(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 520(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 288(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 560(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd ra, 528(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    ld ra, 280(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 536(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 272(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 544(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 264(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 552(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 176(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 560(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 160(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 568(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 192(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 144(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 576(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 176(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 112(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 584(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 160(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 592(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 128(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 600(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    add ra, a1, ra
 ; NOREMAT-NEXT:    sd ra, 608(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 616(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    sd ra, 624(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    add ra, a1, s11
 ; NOREMAT-NEXT:    add s11, a1, s7
 ; NOREMAT-NEXT:    add s7, a1, s3
@@ -657,7 +646,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    vse32.v v8, (a6)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 272(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 256(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    vse32.v v8, (t0)
@@ -674,7 +663,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    vse32.v v8, (s0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 264(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 248(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    vse32.v v8, (s2)
@@ -691,31 +680,37 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    vse32.v v8, (s10)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 256(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 240(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 56(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    ld a0, 64(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    ld a0, 232(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    ld a0, 80(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 248(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 88(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    ld a0, 96(sp) # 8-byte Folded Reload
@@ -724,28 +719,28 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    ld a0, 104(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 112(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 120(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 120(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 128(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    ld a0, 136(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 144(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 224(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    ld a0, 152(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 240(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 168(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 168(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 408(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 184(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 416(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    ld a0, 424(sp) # 8-byte Folded Reload
@@ -757,13 +752,13 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    ld a0, 440(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 448(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 216(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 456(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 448(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 232(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 456(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    ld a0, 464(sp) # 8-byte Folded Reload
@@ -781,13 +776,13 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    ld a0, 496(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 504(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 208(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 512(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 504(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 224(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 512(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    ld a0, 520(sp) # 8-byte Folded Reload
@@ -805,13 +800,13 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    ld a0, 552(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 560(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 200(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 568(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 560(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 216(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 568(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    ld a0, 576(sp) # 8-byte Folded Reload
@@ -829,13 +824,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    ld a0, 608(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 616(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    vse32.v v8, (a0)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 624(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    vse32.v v8, (a0)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 208(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 192(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    vse32.v v8, (ra)
@@ -852,29 +841,25 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    vse32.v v8, (a7)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 200(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld a0, 184(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    vse32.v v8, (a4)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    csrr a0, vlenb
-; NOREMAT-NEXT:    slli a0, a0, 1
-; NOREMAT-NEXT:    add sp, sp, a0
-; NOREMAT-NEXT:    .cfi_def_cfa sp, 752
-; NOREMAT-NEXT:    ld ra, 744(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s0, 736(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s1, 728(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s2, 720(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s3, 712(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s4, 704(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s5, 696(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s6, 688(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s7, 680(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s8, 672(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s9, 664(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s10, 656(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s11, 648(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld ra, 712(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s0, 704(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s1, 696(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s2, 688(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s3, 680(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s4, 672(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s5, 664(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s6, 656(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s7, 648(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s8, 640(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s9, 632(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s10, 624(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s11, 616(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    .cfi_restore ra
 ; NOREMAT-NEXT:    .cfi_restore s0
 ; NOREMAT-NEXT:    .cfi_restore s1
@@ -888,7 +873,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    .cfi_restore s9
 ; NOREMAT-NEXT:    .cfi_restore s10
 ; NOREMAT-NEXT:    .cfi_restore s11
-; NOREMAT-NEXT:    addi sp, sp, 752
+; NOREMAT-NEXT:    addi sp, sp, 720
 ; NOREMAT-NEXT:    .cfi_def_cfa_offset 0
 ; NOREMAT-NEXT:    ret
 ;
@@ -923,10 +908,10 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    .cfi_offset s10, -96
 ; REMAT-NEXT:    .cfi_offset s11, -104
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a3, 18
+; REMAT-NEXT:    li a3, 14
 ; REMAT-NEXT:    mul a2, a2, a3
 ; REMAT-NEXT:    sub sp, sp, a2
-; REMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 18 * vlenb
+; REMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x0e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 14 * vlenb
 ; REMAT-NEXT:    li a4, 32
 ; REMAT-NEXT:    addi a5, a0, 512
 ; REMAT-NEXT:    addi a3, a0, 1024
@@ -976,51 +961,32 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    vle32.v v8, (a5)
 ; REMAT-NEXT:    li a4, 13
 ; REMAT-NEXT:    slli a4, a4, 10
-; REMAT-NEXT:    vle32.v v10, (a3)
 ; REMAT-NEXT:    vle32.v v12, (a3)
+; REMAT-NEXT:    vle32.v v14, (a3)
 ; REMAT-NEXT:    li a3, 27
 ; REMAT-NEXT:    slli a3, a3, 9
-; REMAT-NEXT:    vle32.v v14, (a2)
 ; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    add a2, a0, a6
 ; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    add a2, a0, a6
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    add a2, a0, a7
 ; REMAT-NEXT:    vle32.v v22, (a2)
+; REMAT-NEXT:    add a2, a0, a7
 ; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    add a2, a0, t0
 ; REMAT-NEXT:    vle32.v v26, (a2)
+; REMAT-NEXT:    add a2, a0, t0
 ; REMAT-NEXT:    vle32.v v28, (a2)
-; REMAT-NEXT:    add a2, a0, t1
 ; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    add a2, a0, t1
 ; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 4
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    add a2, a0, t2
-; REMAT-NEXT:    vle32.v v4, (a0)
 ; REMAT-NEXT:    vle32.v v2, (a2)
-; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a5, 14
-; REMAT-NEXT:    mul a2, a2, a5
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    add a2, a0, t3
-; REMAT-NEXT:    sf.vc.vv 3, 0, v4, v8
-; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
-; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    add a2, a0, t4
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v14
+; REMAT-NEXT:    add a2, a0, t3
+; REMAT-NEXT:    vle32.v v0, (a0)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v8
 ; REMAT-NEXT:    vle32.v v0, (a2)
-; REMAT-NEXT:    add a2, a0, t5
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v18
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v12
 ; REMAT-NEXT:    vle32.v v8, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
 ; REMAT-NEXT:    li a5, 12
@@ -1028,117 +994,112 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
 ; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
-; REMAT-NEXT:    add a2, a0, t6
+; REMAT-NEXT:    add a2, a0, t4
+; REMAT-NEXT:    vle32.v v12, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v16
+; REMAT-NEXT:    vle32.v v14, (a2)
+; REMAT-NEXT:    add a2, a0, t5
+; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v20
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v22
+; REMAT-NEXT:    add a2, a0, t6
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    add a2, a0, s0
+; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v24, v26
+; REMAT-NEXT:    add a2, a0, s0
 ; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    add a2, a0, s1
+; REMAT-NEXT:    sf.vc.vv 3, 0, v26, v28
 ; REMAT-NEXT:    vle32.v v26, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v30
+; REMAT-NEXT:    add a2, a0, s1
 ; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v30, v6
+; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    add a2, a0, s2
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    csrr a5, vlenb
-; REMAT-NEXT:    slli a5, a5, 4
-; REMAT-NEXT:    add a5, sp, a5
-; REMAT-NEXT:    addi a5, a5, 432
-; REMAT-NEXT:    vl2r.v v12, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v2
-; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v4, v2
+; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    add a2, a0, s3
-; REMAT-NEXT:    vle32.v v12, (a2)
+; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v0
+; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    add a2, a0, s4
+; REMAT-NEXT:    vle32.v v10, (a2)
 ; REMAT-NEXT:    csrr a5, vlenb
-; REMAT-NEXT:    li a6, 14
+; REMAT-NEXT:    li a6, 12
 ; REMAT-NEXT:    mul a5, a5, a6
 ; REMAT-NEXT:    add a5, sp, a5
 ; REMAT-NEXT:    addi a5, a5, 432
-; REMAT-NEXT:    vl2r.v v16, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
+; REMAT-NEXT:    vl2r.v v30, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
 ; REMAT-NEXT:    vle32.v v30, (a2)
-; REMAT-NEXT:    add a2, a0, s4
-; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v6, v10
-; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    add a2, a0, s5
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v14
-; REMAT-NEXT:    vle32.v v4, (a2)
+; REMAT-NEXT:    vle32.v v12, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v16
+; REMAT-NEXT:    vle32.v v14, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a5, 12
+; REMAT-NEXT:    mul a2, a2, a5
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v14, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    add a2, a0, s6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    csrr a5, vlenb
-; REMAT-NEXT:    li a6, 12
-; REMAT-NEXT:    mul a5, a5, a6
-; REMAT-NEXT:    add a5, sp, a5
-; REMAT-NEXT:    addi a5, a5, 432
-; REMAT-NEXT:    vl2r.v v0, (a5) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v18
-; REMAT-NEXT:    vle32.v v0, (a2)
-; REMAT-NEXT:    add a2, a0, s7
+; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v20
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v22
+; REMAT-NEXT:    add a2, a0, s7
+; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
 ; REMAT-NEXT:    add a2, a0, s8
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v24, v26
+; REMAT-NEXT:    sf.vc.vv 3, 0, v26, v28
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    add a2, a0, s9
 ; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v8
-; REMAT-NEXT:    vle32.v v28, (a2)
-; REMAT-NEXT:    add a2, a0, s10
+; REMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v2, v12
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 3
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v12, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    add a2, a0, s10
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v4, v2
+; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    add a2, a0, s11
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v30, v16
-; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    vle32.v v4, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v10
+; REMAT-NEXT:    vle32.v v10, (a2)
 ; REMAT-NEXT:    add a2, a0, ra
 ; REMAT-NEXT:    vle32.v v2, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v6, v10
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 1
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
+; REMAT-NEXT:    vle32.v v12, (a2)
 ; REMAT-NEXT:    add a2, a0, a4
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v4, v14
+; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    csrr a4, vlenb
+; REMAT-NEXT:    li a5, 12
+; REMAT-NEXT:    mul a4, a4, a5
+; REMAT-NEXT:    add a4, sp, a4
+; REMAT-NEXT:    addi a4, a4, 432
+; REMAT-NEXT:    vl2r.v v30, (a4) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v30, v14
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 2
-; REMAT-NEXT:    add a2, sp, a2
-; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    addi a2, sp, 432
 ; REMAT-NEXT:    vs2r.v v14, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    add a2, a0, a3
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v18
-; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v16
+; REMAT-NEXT:    vle32.v v16, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 4
+; REMAT-NEXT:    li a3, 12
+; REMAT-NEXT:    mul a2, a2, a3
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
-; REMAT-NEXT:    vs2r.v v18, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    li a5, 7
 ; REMAT-NEXT:    slli a5, a5, 11
 ; REMAT-NEXT:    add a2, a0, a5
-; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    addi a3, sp, 432
-; REMAT-NEXT:    vs2r.v v18, (a3) # Unknown-size Folded Spill
+; REMAT-NEXT:    vle32.v v16, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v20
 ; REMAT-NEXT:    vle32.v v18, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a3, 14
+; REMAT-NEXT:    li a3, 10
 ; REMAT-NEXT:    mul a2, a2, a3
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
@@ -1150,8 +1111,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v26, v24
 ; REMAT-NEXT:    vle32.v v20, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a3, 12
-; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    slli a2, a2, 3
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
 ; REMAT-NEXT:    vs2r.v v20, (a2) # Unknown-size Folded Spill
@@ -1159,10 +1119,10 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v30, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v8
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v6
 ; REMAT-NEXT:    vle32.v v8, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a3, 10
+; REMAT-NEXT:    li a3, 6
 ; REMAT-NEXT:    mul a2, a2, a3
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
@@ -1171,26 +1131,20 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    slli a2, a2, 9
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    slli a3, a3, 3
-; REMAT-NEXT:    add a3, sp, a3
-; REMAT-NEXT:    addi a3, a3, 432
-; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v12
+; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v4
 ; REMAT-NEXT:    vle32.v v8, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    slli a2, a2, 3
+; REMAT-NEXT:    slli a2, a2, 2
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
 ; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    lui a2, 4
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v2
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v2
 ; REMAT-NEXT:    vle32.v v8, (a2)
 ; REMAT-NEXT:    csrr a2, vlenb
-; REMAT-NEXT:    li a3, 6
-; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    slli a2, a2, 1
 ; REMAT-NEXT:    add a2, sp, a2
 ; REMAT-NEXT:    addi a2, a2, 432
 ; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
@@ -1198,21 +1152,13 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    addiw a2, a2, 512
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v2, (a2)
-; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    slli a3, a3, 1
-; REMAT-NEXT:    add a3, sp, a3
-; REMAT-NEXT:    addi a3, a3, 432
-; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v0
 ; REMAT-NEXT:    vle32.v v20, (a2)
 ; REMAT-NEXT:    li a2, 17
 ; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v0, (a2)
-; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    slli a3, a3, 2
-; REMAT-NEXT:    add a3, sp, a3
-; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    addi a3, sp, 432
 ; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
 ; REMAT-NEXT:    vle32.v v22, (a2)
@@ -1221,20 +1167,19 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v24, (a2)
 ; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    slli a3, a3, 4
+; REMAT-NEXT:    li a4, 12
+; REMAT-NEXT:    mul a3, a3, a4
 ; REMAT-NEXT:    add a3, sp, a3
 ; REMAT-NEXT:    addi a3, a3, 432
 ; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT:    addi a3, sp, 432
-; REMAT-NEXT:    vl2r.v v10, (a3) # Unknown-size Folded Reload
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v16
 ; REMAT-NEXT:    vle32.v v8, (a2)
 ; REMAT-NEXT:    li a2, 9
 ; REMAT-NEXT:    slli a2, a2, 11
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v26, (a2)
 ; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    li a4, 14
+; REMAT-NEXT:    li a4, 10
 ; REMAT-NEXT:    mul a3, a3, a4
 ; REMAT-NEXT:    add a3, sp, a3
 ; REMAT-NEXT:    addi a3, a3, 432
@@ -1246,8 +1191,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v28, (a2)
 ; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    li a4, 12
-; REMAT-NEXT:    mul a3, a3, a4
+; REMAT-NEXT:    slli a3, a3, 3
 ; REMAT-NEXT:    add a3, sp, a3
 ; REMAT-NEXT:    addi a3, a3, 432
 ; REMAT-NEXT:    vl2r.v v12, (a3) # Unknown-size Folded Reload
@@ -1258,7 +1202,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    li a4, 10
+; REMAT-NEXT:    li a4, 6
 ; REMAT-NEXT:    mul a3, a3, a4
 ; REMAT-NEXT:    add a3, sp, a3
 ; REMAT-NEXT:    addi a3, a3, 432
@@ -1270,7 +1214,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    slli a3, a3, 3
+; REMAT-NEXT:    slli a3, a3, 2
 ; REMAT-NEXT:    add a3, sp, a3
 ; REMAT-NEXT:    addi a3, a3, 432
 ; REMAT-NEXT:    vl2r.v v16, (a3) # Unknown-size Folded Reload
@@ -1280,8 +1224,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    csrr a3, vlenb
-; REMAT-NEXT:    li a4, 6
-; REMAT-NEXT:    mul a3, a3, a4
+; REMAT-NEXT:    slli a3, a3, 1
 ; REMAT-NEXT:    add a3, sp, a3
 ; REMAT-NEXT:    addi a3, a3, 432
 ; REMAT-NEXT:    vl2r.v v18, (a3) # Unknown-size Folded Reload
@@ -1293,15 +1236,15 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    li s7, 21
-; REMAT-NEXT:    slli s7, s7, 10
-; REMAT-NEXT:    add a2, a0, s7
+; REMAT-NEXT:    li a2, 21
+; REMAT-NEXT:    slli a2, a2, 10
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v0, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    lui s4, 5
-; REMAT-NEXT:    addiw s4, s4, 1536
-; REMAT-NEXT:    add a2, a0, s4
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    addiw a2, a2, 1536
+; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v24, (a2)
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
 ; REMAT-NEXT:    vle32.v v8, (a2)
@@ -1489,18 +1432,14 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    lui a0, 2
 ; REMAT-NEXT:    add a0, a1, a0
 ; REMAT-NEXT:    sd a0, 320(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    li a0, 17
-; REMAT-NEXT:    slli a0, a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 312(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s4, a1, s4
+; REMAT-NEXT:    sd s4, 312(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add s5, a1, s5
 ; REMAT-NEXT:    sd s5, 304(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add s6, a1, s6
 ; REMAT-NEXT:    sd s6, 296(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    li a0, 5
-; REMAT-NEXT:    slli a0, a0, 11
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sd a0, 288(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s7, a1, s7
+; REMAT-NEXT:    sd s7, 288(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add s8, a1, s8
 ; REMAT-NEXT:    sd s8, 280(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    add s9, a1, s9
@@ -1571,10 +1510,14 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    addiw a0, a0, 512
 ; REMAT-NEXT:    add a0, a1, a0
 ; REMAT-NEXT:    sd a0, 120(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s7, a1, s7
-; REMAT-NEXT:    sd s7, 112(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    add s4, a1, s4
-; REMAT-NEXT:    sd s4, 104(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 21
+; REMAT-NEXT:    slli a0, a0, 10
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 112(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 5
+; REMAT-NEXT:    addiw a0, a0, 1536
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 104(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 11
 ; REMAT-NEXT:    slli a0, a0, 11
 ; REMAT-NEXT:    add a0, a1, a0
@@ -1879,7 +1822,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    csrr a0, vlenb
-; REMAT-NEXT:    li a1, 18
+; REMAT-NEXT:    li a1, 14
 ; REMAT-NEXT:    mul a0, a0, a1
 ; REMAT-NEXT:    add sp, sp, a0
 ; REMAT-NEXT:    .cfi_def_cfa sp, 544
diff --git a/llvm/test/CodeGen/RISCV/push-pop-popret.ll b/llvm/test/CodeGen/RISCV/push-pop-popret.ll
index 1fbdaa76dfb68..5ce5849af700c 100644
--- a/llvm/test/CodeGen/RISCV/push-pop-popret.ll
+++ b/llvm/test/CodeGen/RISCV/push-pop-popret.ll
@@ -1174,16 +1174,16 @@ define i32 @varargs(ptr %fmt, ...) {
 ; RV64IZCMP:       # %bb.0:
 ; RV64IZCMP-NEXT:    addi sp, sp, -80
 ; RV64IZCMP-NEXT:    .cfi_def_cfa_offset 80
-; RV64IZCMP-NEXT:    sd a1, 24(sp)
-; RV64IZCMP-NEXT:    addi a0, sp, 28
-; RV64IZCMP-NEXT:    sd a0, 8(sp)
-; RV64IZCMP-NEXT:    lw a0, 24(sp)
 ; RV64IZCMP-NEXT:    sd a5, 56(sp)
 ; RV64IZCMP-NEXT:    sd a6, 64(sp)
 ; RV64IZCMP-NEXT:    sd a7, 72(sp)
+; RV64IZCMP-NEXT:    sd a1, 24(sp)
 ; RV64IZCMP-NEXT:    sd a2, 32(sp)
 ; RV64IZCMP-NEXT:    sd a3, 40(sp)
 ; RV64IZCMP-NEXT:    sd a4, 48(sp)
+; RV64IZCMP-NEXT:    addi a0, sp, 28
+; RV64IZCMP-NEXT:    sd a0, 8(sp)
+; RV64IZCMP-NEXT:    lw a0, 24(sp)
 ; RV64IZCMP-NEXT:    addi sp, sp, 80
 ; RV64IZCMP-NEXT:    .cfi_def_cfa_offset 0
 ; RV64IZCMP-NEXT:    ret
@@ -1210,16 +1210,16 @@ define i32 @varargs(ptr %fmt, ...) {
 ; RV64IZCMP-SR:       # %bb.0:
 ; RV64IZCMP-SR-NEXT:    addi sp, sp, -80
 ; RV64IZCMP-SR-NEXT:    .cfi_def_cfa_offset 80
-; RV64IZCMP-SR-NEXT:    sd a1, 24(sp)
-; RV64IZCMP-SR-NEXT:    addi a0, sp, 28
-; RV64IZCMP-SR-NEXT:    sd a0, 8(sp)
-; RV64IZCMP-SR-NEXT:    lw a0, 24(sp)
 ; RV64IZCMP-SR-NEXT:    sd a5, 56(sp)
 ; RV64IZCMP-SR-NEXT:    sd a6, 64(sp)
 ; RV64IZCMP-SR-NEXT:    sd a7, 72(sp)
+; RV64IZCMP-SR-NEXT:    sd a1, 24(sp)
 ; RV64IZCMP-SR-NEXT:    sd a2, 32(sp)
 ; RV64IZCMP-SR-NEXT:    sd a3, 40(sp)
 ; RV64IZCMP-SR-NEXT:    sd a4, 48(sp)
+; RV64IZCMP-SR-NEXT:    addi a0, sp, 28
+; RV64IZCMP-SR-NEXT:    sd a0, 8(sp)
+; RV64IZCMP-SR-NEXT:    lw a0, 24(sp)
 ; RV64IZCMP-SR-NEXT:    addi sp, sp, 80
 ; RV64IZCMP-SR-NEXT:    .cfi_def_cfa_offset 0
 ; RV64IZCMP-SR-NEXT:    ret
@@ -1246,16 +1246,16 @@ define i32 @varargs(ptr %fmt, ...) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -80
 ; RV64I-NEXT:    .cfi_def_cfa_offset 80
-; RV64I-NEXT:    sd a1, 24(sp)
-; RV64I-NEXT:    addi a0, sp, 28
-; RV64I-NEXT:    sd a0, 8(sp)
-; RV64I-NEXT:    lw a0, 24(sp)
 ; RV64I-NEXT:    sd a5, 56(sp)
 ; RV64I-NEXT:    sd a6, 64(sp)
 ; RV64I-NEXT:    sd a7, 72(sp)
+; RV64I-NEXT:    sd a1, 24(sp)
 ; RV64I-NEXT:    sd a2, 32(sp)
 ; RV64I-NEXT:    sd a3, 40(sp)
 ; RV64I-NEXT:    sd a4, 48(sp)
+; RV64I-NEXT:    addi a0, sp, 28
+; RV64I-NEXT:    sd a0, 8(sp)
+; RV64I-NEXT:    lw a0, 24(sp)
 ; RV64I-NEXT:    addi sp, sp, 80
 ; RV64I-NEXT:    .cfi_def_cfa_offset 0
 ; RV64I-NEXT:    ret
@@ -1291,26 +1291,26 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) {
 ; RV32IZCMP-NEXT:    lw t3, 20(a5)
 ; RV32IZCMP-NEXT:    lw t4, 24(a5)
 ; RV32IZCMP-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-NEXT:    lw t6, 48(a5)
-; RV32IZCMP-NEXT:    lw s2, 52(a5)
-; RV32IZCMP-NEXT:    lw a3, 56(a5)
-; RV32IZCMP-NEXT:    lw a4, 60(a5)
-; RV32IZCMP-NEXT:    lw a1, 64(a5)
-; RV32IZCMP-NEXT:    lw s0, 68(a5)
-; RV32IZCMP-NEXT:    lw s3, 32(a5)
-; RV32IZCMP-NEXT:    lw s4, 36(a5)
-; RV32IZCMP-NEXT:    lw s1, 40(a5)
-; RV32IZCMP-NEXT:    lw a2, 44(a5)
-; RV32IZCMP-NEXT:    sw s0, 68(a5)
-; RV32IZCMP-NEXT:    sw a1, 64(a5)
-; RV32IZCMP-NEXT:    sw a4, 60(a5)
-; RV32IZCMP-NEXT:    sw a3, 56(a5)
-; RV32IZCMP-NEXT:    sw s2, 52(a5)
-; RV32IZCMP-NEXT:    sw t6, 48(a5)
-; RV32IZCMP-NEXT:    sw a2, 44(a5)
-; RV32IZCMP-NEXT:    sw s1, 40(a5)
-; RV32IZCMP-NEXT:    sw s4, 36(a5)
-; RV32IZCMP-NEXT:    sw s3, 32(a5)
+; RV32IZCMP-NEXT:    lw t6, 32(a5)
+; RV32IZCMP-NEXT:    lw s2, 36(a5)
+; RV32IZCMP-NEXT:    lw s3, 40(a5)
+; RV32IZCMP-NEXT:    lw s4, 44(a5)
+; RV32IZCMP-NEXT:    lw a1, 48(a5)
+; RV32IZCMP-NEXT:    lw s0, 52(a5)
+; RV32IZCMP-NEXT:    lw s1, 56(a5)
+; RV32IZCMP-NEXT:    lw a2, 60(a5)
+; RV32IZCMP-NEXT:    lw a3, 64(a5)
+; RV32IZCMP-NEXT:    lw a4, 68(a5)
+; RV32IZCMP-NEXT:    sw a4, 68(a5)
+; RV32IZCMP-NEXT:    sw a3, 64(a5)
+; RV32IZCMP-NEXT:    sw a2, 60(a5)
+; RV32IZCMP-NEXT:    sw s1, 56(a5)
+; RV32IZCMP-NEXT:    sw s0, 52(a5)
+; RV32IZCMP-NEXT:    sw a1, 48(a5)
+; RV32IZCMP-NEXT:    sw s4, 44(a5)
+; RV32IZCMP-NEXT:    sw s3, 40(a5)
+; RV32IZCMP-NEXT:    sw s2, 36(a5)
+; RV32IZCMP-NEXT:    sw t6, 32(a5)
 ; RV32IZCMP-NEXT:    sw t5, 28(a5)
 ; RV32IZCMP-NEXT:    sw t4, 24(a5)
 ; RV32IZCMP-NEXT:    sw t3, 20(a5)
@@ -1340,26 +1340,26 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) {
 ; RV64IZCMP-NEXT:    lw t3, 20(a5)
 ; RV64IZCMP-NEXT:    lw t4, 24(a5)
 ; RV64IZCMP-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-NEXT:    lw t6, 48(a5)
-; RV64IZCMP-NEXT:    lw s2, 52(a5)
-; RV64IZCMP-NEXT:    lw a3, 56(a5)
-; RV64IZCMP-NEXT:    lw a4, 60(a5)
-; RV64IZCMP-NEXT:    lw a1, 64(a5)
-; RV64IZCMP-NEXT:    lw s0, 68(a5)
-; RV64IZCMP-NEXT:    lw s3, 32(a5)
-; RV64IZCMP-NEXT:    lw s4, 36(a5)
-; RV64IZCMP-NEXT:    lw s1, 40(a5)
-; RV64IZCMP-NEXT:    lw a2, 44(a5)
-; RV64IZCMP-NEXT:    sw s0, 68(a5)
-; RV64IZCMP-NEXT:    sw a1, 64(a5)
-; RV64IZCMP-NEXT:    sw a4, 60(a5)
-; RV64IZCMP-NEXT:    sw a3, 56(a5)
-; RV64IZCMP-NEXT:    sw s2, 52(a5)
-; RV64IZCMP-NEXT:    sw t6, 48(a5)
-; RV64IZCMP-NEXT:    sw a2, 44(a5)
-; RV64IZCMP-NEXT:    sw s1, 40(a5)
-; RV64IZCMP-NEXT:    sw s4, 36(a5)
-; RV64IZCMP-NEXT:    sw s3, 32(a5)
+; RV64IZCMP-NEXT:    lw t6, 32(a5)
+; RV64IZCMP-NEXT:    lw s2, 36(a5)
+; RV64IZCMP-NEXT:    lw s3, 40(a5)
+; RV64IZCMP-NEXT:    lw s4, 44(a5)
+; RV64IZCMP-NEXT:    lw a1, 48(a5)
+; RV64IZCMP-NEXT:    lw s0, 52(a5)
+; RV64IZCMP-NEXT:    lw s1, 56(a5)
+; RV64IZCMP-NEXT:    lw a2, 60(a5)
+; RV64IZCMP-NEXT:    lw a3, 64(a5)
+; RV64IZCMP-NEXT:    lw a4, 68(a5)
+; RV64IZCMP-NEXT:    sw a4, 68(a5)
+; RV64IZCMP-NEXT:    sw a3, 64(a5)
+; RV64IZCMP-NEXT:    sw a2, 60(a5)
+; RV64IZCMP-NEXT:    sw s1, 56(a5)
+; RV64IZCMP-NEXT:    sw s0, 52(a5)
+; RV64IZCMP-NEXT:    sw a1, 48(a5)
+; RV64IZCMP-NEXT:    sw s4, 44(a5)
+; RV64IZCMP-NEXT:    sw s3, 40(a5)
+; RV64IZCMP-NEXT:    sw s2, 36(a5)
+; RV64IZCMP-NEXT:    sw t6, 32(a5)
 ; RV64IZCMP-NEXT:    sw t5, 28(a5)
 ; RV64IZCMP-NEXT:    sw t4, 24(a5)
 ; RV64IZCMP-NEXT:    sw t3, 20(a5)
@@ -1389,26 +1389,26 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) {
 ; RV32IZCMP-SR-NEXT:    lw t3, 20(a5)
 ; RV32IZCMP-SR-NEXT:    lw t4, 24(a5)
 ; RV32IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    lw t6, 48(a5)
-; RV32IZCMP-SR-NEXT:    lw s2, 52(a5)
-; RV32IZCMP-SR-NEXT:    lw a3, 56(a5)
-; RV32IZCMP-SR-NEXT:    lw a4, 60(a5)
-; RV32IZCMP-SR-NEXT:    lw a1, 64(a5)
-; RV32IZCMP-SR-NEXT:    lw s0, 68(a5)
-; RV32IZCMP-SR-NEXT:    lw s3, 32(a5)
-; RV32IZCMP-SR-NEXT:    lw s4, 36(a5)
-; RV32IZCMP-SR-NEXT:    lw s1, 40(a5)
-; RV32IZCMP-SR-NEXT:    lw a2, 44(a5)
-; RV32IZCMP-SR-NEXT:    sw s0, 68(a5)
-; RV32IZCMP-SR-NEXT:    sw a1, 64(a5)
-; RV32IZCMP-SR-NEXT:    sw a4, 60(a5)
-; RV32IZCMP-SR-NEXT:    sw a3, 56(a5)
-; RV32IZCMP-SR-NEXT:    sw s2, 52(a5)
-; RV32IZCMP-SR-NEXT:    sw t6, 48(a5)
-; RV32IZCMP-SR-NEXT:    sw a2, 44(a5)
-; RV32IZCMP-SR-NEXT:    sw s1, 40(a5)
-; RV32IZCMP-SR-NEXT:    sw s4, 36(a5)
-; RV32IZCMP-SR-NEXT:    sw s3, 32(a5)
+; RV32IZCMP-SR-NEXT:    lw t6, 32(a5)
+; RV32IZCMP-SR-NEXT:    lw s2, 36(a5)
+; RV32IZCMP-SR-NEXT:    lw s3, 40(a5)
+; RV32IZCMP-SR-NEXT:    lw s4, 44(a5)
+; RV32IZCMP-SR-NEXT:    lw a1, 48(a5)
+; RV32IZCMP-SR-NEXT:    lw s0, 52(a5)
+; RV32IZCMP-SR-NEXT:    lw s1, 56(a5)
+; RV32IZCMP-SR-NEXT:    lw a2, 60(a5)
+; RV32IZCMP-SR-NEXT:    lw a3, 64(a5)
+; RV32IZCMP-SR-NEXT:    lw a4, 68(a5)
+; RV32IZCMP-SR-NEXT:    sw a4, 68(a5)
+; RV32IZCMP-SR-NEXT:    sw a3, 64(a5)
+; RV32IZCMP-SR-NEXT:    sw a2, 60(a5)
+; RV32IZCMP-SR-NEXT:    sw s1, 56(a5)
+; RV32IZCMP-SR-NEXT:    sw s0, 52(a5)
+; RV32IZCMP-SR-NEXT:    sw a1, 48(a5)
+; RV32IZCMP-SR-NEXT:    sw s4, 44(a5)
+; RV32IZCMP-SR-NEXT:    sw s3, 40(a5)
+; RV32IZCMP-SR-NEXT:    sw s2, 36(a5)
+; RV32IZCMP-SR-NEXT:    sw t6, 32(a5)
 ; RV32IZCMP-SR-NEXT:    sw t5, 28(a5)
 ; RV32IZCMP-SR-NEXT:    sw t4, 24(a5)
 ; RV32IZCMP-SR-NEXT:    sw t3, 20(a5)
@@ -1438,26 +1438,26 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) {
 ; RV64IZCMP-SR-NEXT:    lw t3, 20(a5)
 ; RV64IZCMP-SR-NEXT:    lw t4, 24(a5)
 ; RV64IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    lw t6, 48(a5)
-; RV64IZCMP-SR-NEXT:    lw s2, 52(a5)
-; RV64IZCMP-SR-NEXT:    lw a3, 56(a5)
-; RV64IZCMP-SR-NEXT:    lw a4, 60(a5)
-; RV64IZCMP-SR-NEXT:    lw a1, 64(a5)
-; RV64IZCMP-SR-NEXT:    lw s0, 68(a5)
-; RV64IZCMP-SR-NEXT:    lw s3, 32(a5)
-; RV64IZCMP-SR-NEXT:    lw s4, 36(a5)
-; RV64IZCMP-SR-NEXT:    lw s1, 40(a5)
-; RV64IZCMP-SR-NEXT:    lw a2, 44(a5)
-; RV64IZCMP-SR-NEXT:    sw s0, 68(a5)
-; RV64IZCMP-SR-NEXT:    sw a1, 64(a5)
-; RV64IZCMP-SR-NEXT:    sw a4, 60(a5)
-; RV64IZCMP-SR-NEXT:    sw a3, 56(a5)
-; RV64IZCMP-SR-NEXT:    sw s2, 52(a5)
-; RV64IZCMP-SR-NEXT:    sw t6, 48(a5)
-; RV64IZCMP-SR-NEXT:    sw a2, 44(a5)
-; RV64IZCMP-SR-NEXT:    sw s1, 40(a5)
-; RV64IZCMP-SR-NEXT:    sw s4, 36(a5)
-; RV64IZCMP-SR-NEXT:    sw s3, 32(a5)
+; RV64IZCMP-SR-NEXT:    lw t6, 32(a5)
+; RV64IZCMP-SR-NEXT:    lw s2, 36(a5)
+; RV64IZCMP-SR-NEXT:    lw s3, 40(a5)
+; RV64IZCMP-SR-NEXT:    lw s4, 44(a5)
+; RV64IZCMP-SR-NEXT:    lw a1, 48(a5)
+; RV64IZCMP-SR-NEXT:    lw s0, 52(a5)
+; RV64IZCMP-SR-NEXT:    lw s1, 56(a5)
+; RV64IZCMP-SR-NEXT:    lw a2, 60(a5)
+; RV64IZCMP-SR-NEXT:    lw a3, 64(a5)
+; RV64IZCMP-SR-NEXT:    lw a4, 68(a5)
+; RV64IZCMP-SR-NEXT:    sw a4, 68(a5)
+; RV64IZCMP-SR-NEXT:    sw a3, 64(a5)
+; RV64IZCMP-SR-NEXT:    sw a2, 60(a5)
+; RV64IZCMP-SR-NEXT:    sw s1, 56(a5)
+; RV64IZCMP-SR-NEXT:    sw s0, 52(a5)
+; RV64IZCMP-SR-NEXT:    sw a1, 48(a5)
+; RV64IZCMP-SR-NEXT:    sw s4, 44(a5)
+; RV64IZCMP-SR-NEXT:    sw s3, 40(a5)
+; RV64IZCMP-SR-NEXT:    sw s2, 36(a5)
+; RV64IZCMP-SR-NEXT:    sw t6, 32(a5)
 ; RV64IZCMP-SR-NEXT:    sw t5, 28(a5)
 ; RV64IZCMP-SR-NEXT:    sw t4, 24(a5)
 ; RV64IZCMP-SR-NEXT:    sw t3, 20(a5)
@@ -1492,26 +1492,26 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) {
 ; RV32I-NEXT:    lw a7, 20(a5)
 ; RV32I-NEXT:    lw t0, 24(a5)
 ; RV32I-NEXT:    lw t1, 28(a5)
-; RV32I-NEXT:    lw t2, 48(a5)
-; RV32I-NEXT:    lw t3, 52(a5)
-; RV32I-NEXT:    lw t4, 56(a5)
-; RV32I-NEXT:    lw t5, 60(a5)
-; RV32I-NEXT:    lw t6, 64(a5)
-; RV32I-NEXT:    lw s0, 68(a5)
-; RV32I-NEXT:    lw s1, 32(a5)
-; RV32I-NEXT:    lw s2, 36(a5)
-; RV32I-NEXT:    lw s3, 40(a5)
-; RV32I-NEXT:    lw s4, 44(a5)
-; RV32I-NEXT:    sw s0, 68(a5)
-; RV32I-NEXT:    sw t6, 64(a5)
-; RV32I-NEXT:    sw t5, 60(a5)
-; RV32I-NEXT:    sw t4, 56(a5)
-; RV32I-NEXT:    sw t3, 52(a5)
-; RV32I-NEXT:    sw t2, 48(a5)
-; RV32I-NEXT:    sw s4, 44(a5)
-; RV32I-NEXT:    sw s3, 40(a5)
-; RV32I-NEXT:    sw s2, 36(a5)
-; RV32I-NEXT:    sw s1, 32(a5)
+; RV32I-NEXT:    lw t2, 32(a5)
+; RV32I-NEXT:    lw t3, 36(a5)
+; RV32I-NEXT:    lw t4, 40(a5)
+; RV32I-NEXT:    lw t5, 44(a5)
+; RV32I-NEXT:    lw t6, 48(a5)
+; RV32I-NEXT:    lw s0, 52(a5)
+; RV32I-NEXT:    lw s1, 56(a5)
+; RV32I-NEXT:    lw s2, 60(a5)
+; RV32I-NEXT:    lw s3, 64(a5)
+; RV32I-NEXT:    lw s4, 68(a5)
+; RV32I-NEXT:    sw s4, 68(a5)
+; RV32I-NEXT:    sw s3, 64(a5)
+; RV32I-NEXT:    sw s2, 60(a5)
+; RV32I-NEXT:    sw s1, 56(a5)
+; RV32I-NEXT:    sw s0, 52(a5)
+; RV32I-NEXT:    sw t6, 48(a5)
+; RV32I-NEXT:    sw t5, 44(a5)
+; RV32I-NEXT:    sw t4, 40(a5)
+; RV32I-NEXT:    sw t3, 36(a5)
+; RV32I-NEXT:    sw t2, 32(a5)
 ; RV32I-NEXT:    sw t1, 28(a5)
 ; RV32I-NEXT:    sw t0, 24(a5)
 ; RV32I-NEXT:    sw a7, 20(a5)
@@ -1558,26 +1558,26 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) {
 ; RV64I-NEXT:    lw a7, 20(a5)
 ; RV64I-NEXT:    lw t0, 24(a5)
 ; RV64I-NEXT:    lw t1, 28(a5)
-; RV64I-NEXT:    lw t2, 48(a5)
-; RV64I-NEXT:    lw t3, 52(a5)
-; RV64I-NEXT:    lw t4, 56(a5)
-; RV64I-NEXT:    lw t5, 60(a5)
-; RV64I-NEXT:    lw t6, 64(a5)
-; RV64I-NEXT:    lw s0, 68(a5)
-; RV64I-NEXT:    lw s1, 32(a5)
-; RV64I-NEXT:    lw s2, 36(a5)
-; RV64I-NEXT:    lw s3, 40(a5)
-; RV64I-NEXT:    lw s4, 44(a5)
-; RV64I-NEXT:    sw s0, 68(a5)
-; RV64I-NEXT:    sw t6, 64(a5)
-; RV64I-NEXT:    sw t5, 60(a5)
-; RV64I-NEXT:    sw t4, 56(a5)
-; RV64I-NEXT:    sw t3, 52(a5)
-; RV64I-NEXT:    sw t2, 48(a5)
-; RV64I-NEXT:    sw s4, 44(a5)
-; RV64I-NEXT:    sw s3, 40(a5)
-; RV64I-NEXT:    sw s2, 36(a5)
-; RV64I-NEXT:    sw s1, 32(a5)
+; RV64I-NEXT:    lw t2, 32(a5)
+; RV64I-NEXT:    lw t3, 36(a5)
+; RV64I-NEXT:    lw t4, 40(a5)
+; RV64I-NEXT:    lw t5, 44(a5)
+; RV64I-NEXT:    lw t6, 48(a5)
+; RV64I-NEXT:    lw s0, 52(a5)
+; RV64I-NEXT:    lw s1, 56(a5)
+; RV64I-NEXT:    lw s2, 60(a5)
+; RV64I-NEXT:    lw s3, 64(a5)
+; RV64I-NEXT:    lw s4, 68(a5)
+; RV64I-NEXT:    sw s4, 68(a5)
+; RV64I-NEXT:    sw s3, 64(a5)
+; RV64I-NEXT:    sw s2, 60(a5)
+; RV64I-NEXT:    sw s1, 56(a5)
+; RV64I-NEXT:    sw s0, 52(a5)
+; RV64I-NEXT:    sw t6, 48(a5)
+; RV64I-NEXT:    sw t5, 44(a5)
+; RV64I-NEXT:    sw t4, 40(a5)
+; RV64I-NEXT:    sw t3, 36(a5)
+; RV64I-NEXT:    sw t2, 32(a5)
 ; RV64I-NEXT:    sw t1, 28(a5)
 ; RV64I-NEXT:    sw t0, 24(a5)
 ; RV64I-NEXT:    sw a7, 20(a5)
@@ -2323,16 +2323,16 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV32IZCMP-NEXT:    .cfi_offset t4, -104
 ; RV32IZCMP-NEXT:    .cfi_offset t5, -108
 ; RV32IZCMP-NEXT:    .cfi_offset t6, -112
-; RV32IZCMP-NEXT:    lui t0, %hi(var_test_irq)
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq)(t0)
+; RV32IZCMP-NEXT:    lui a4, %hi(var_test_irq)
+; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV32IZCMP-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(t0)
+; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(a4)
 ; RV32IZCMP-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(t0)
+; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(a4)
 ; RV32IZCMP-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(t0)
+; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(a4)
 ; RV32IZCMP-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    addi a5, t0, %lo(var_test_irq)
+; RV32IZCMP-NEXT:    addi a5, a4, %lo(var_test_irq)
 ; RV32IZCMP-NEXT:    lw a0, 16(a5)
 ; RV32IZCMP-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
 ; RV32IZCMP-NEXT:    lw a0, 20(a5)
@@ -2352,28 +2352,28 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV32IZCMP-NEXT:    lw s11, 72(a5)
 ; RV32IZCMP-NEXT:    lw ra, 76(a5)
 ; RV32IZCMP-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-NEXT:    lw a7, 112(a5)
-; RV32IZCMP-NEXT:    lw s0, 116(a5)
-; RV32IZCMP-NEXT:    lw a3, 120(a5)
-; RV32IZCMP-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-NEXT:    lw a6, 96(a5)
-; RV32IZCMP-NEXT:    lw a4, 100(a5)
-; RV32IZCMP-NEXT:    lw a2, 104(a5)
-; RV32IZCMP-NEXT:    lw a1, 108(a5)
-; RV32IZCMP-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-NEXT:    sw a3, 120(a5)
-; RV32IZCMP-NEXT:    sw s0, 116(a5)
-; RV32IZCMP-NEXT:    sw a7, 112(a5)
-; RV32IZCMP-NEXT:    sw a1, 108(a5)
-; RV32IZCMP-NEXT:    sw a2, 104(a5)
-; RV32IZCMP-NEXT:    sw a4, 100(a5)
-; RV32IZCMP-NEXT:    sw a6, 96(a5)
-; RV32IZCMP-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-NEXT:    sw t3, 84(a5)
+; RV32IZCMP-NEXT:    lw t2, 84(a5)
+; RV32IZCMP-NEXT:    lw t1, 88(a5)
+; RV32IZCMP-NEXT:    lw t0, 92(a5)
+; RV32IZCMP-NEXT:    lw a7, 96(a5)
+; RV32IZCMP-NEXT:    lw s0, 100(a5)
+; RV32IZCMP-NEXT:    lw a6, 104(a5)
+; RV32IZCMP-NEXT:    lw a3, 108(a5)
+; RV32IZCMP-NEXT:    lw a2, 112(a5)
+; RV32IZCMP-NEXT:    lw a1, 116(a5)
+; RV32IZCMP-NEXT:    lw a0, 120(a5)
+; RV32IZCMP-NEXT:    lw t3, 124(a5)
+; RV32IZCMP-NEXT:    sw t3, 124(a5)
+; RV32IZCMP-NEXT:    sw a0, 120(a5)
+; RV32IZCMP-NEXT:    sw a1, 116(a5)
+; RV32IZCMP-NEXT:    sw a2, 112(a5)
+; RV32IZCMP-NEXT:    sw a3, 108(a5)
+; RV32IZCMP-NEXT:    sw a6, 104(a5)
+; RV32IZCMP-NEXT:    sw s0, 100(a5)
+; RV32IZCMP-NEXT:    sw a7, 96(a5)
+; RV32IZCMP-NEXT:    sw t0, 92(a5)
+; RV32IZCMP-NEXT:    sw t1, 88(a5)
+; RV32IZCMP-NEXT:    sw t2, 84(a5)
 ; RV32IZCMP-NEXT:    sw s1, 80(a5)
 ; RV32IZCMP-NEXT:    sw ra, 76(a5)
 ; RV32IZCMP-NEXT:    sw s11, 72(a5)
@@ -2394,13 +2394,13 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV32IZCMP-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32IZCMP-NEXT:    sw a0, 16(a5)
 ; RV32IZCMP-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(t0)
+; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(a4)
 ; RV32IZCMP-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(t0)
+; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(a4)
 ; RV32IZCMP-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(t0)
+; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(a4)
 ; RV32IZCMP-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq)(t0)
+; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV32IZCMP-NEXT:    lw t0, 88(sp) # 4-byte Folded Reload
 ; RV32IZCMP-NEXT:    lw t1, 84(sp) # 4-byte Folded Reload
 ; RV32IZCMP-NEXT:    lw t2, 80(sp) # 4-byte Folded Reload
@@ -2499,16 +2499,16 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV64IZCMP-NEXT:    .cfi_offset t4, -208
 ; RV64IZCMP-NEXT:    .cfi_offset t5, -216
 ; RV64IZCMP-NEXT:    .cfi_offset t6, -224
-; RV64IZCMP-NEXT:    lui t0, %hi(var_test_irq)
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq)(t0)
+; RV64IZCMP-NEXT:    lui a4, %hi(var_test_irq)
+; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV64IZCMP-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(t0)
+; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(a4)
 ; RV64IZCMP-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(t0)
+; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(a4)
 ; RV64IZCMP-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(t0)
+; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(a4)
 ; RV64IZCMP-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    addi a5, t0, %lo(var_test_irq)
+; RV64IZCMP-NEXT:    addi a5, a4, %lo(var_test_irq)
 ; RV64IZCMP-NEXT:    lw a0, 16(a5)
 ; RV64IZCMP-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
 ; RV64IZCMP-NEXT:    lw a0, 20(a5)
@@ -2528,28 +2528,28 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV64IZCMP-NEXT:    lw s11, 72(a5)
 ; RV64IZCMP-NEXT:    lw ra, 76(a5)
 ; RV64IZCMP-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-NEXT:    lw a7, 112(a5)
-; RV64IZCMP-NEXT:    lw s0, 116(a5)
-; RV64IZCMP-NEXT:    lw a3, 120(a5)
-; RV64IZCMP-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-NEXT:    lw a6, 96(a5)
-; RV64IZCMP-NEXT:    lw a4, 100(a5)
-; RV64IZCMP-NEXT:    lw a2, 104(a5)
-; RV64IZCMP-NEXT:    lw a1, 108(a5)
-; RV64IZCMP-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-NEXT:    sw a3, 120(a5)
-; RV64IZCMP-NEXT:    sw s0, 116(a5)
-; RV64IZCMP-NEXT:    sw a7, 112(a5)
-; RV64IZCMP-NEXT:    sw a1, 108(a5)
-; RV64IZCMP-NEXT:    sw a2, 104(a5)
-; RV64IZCMP-NEXT:    sw a4, 100(a5)
-; RV64IZCMP-NEXT:    sw a6, 96(a5)
-; RV64IZCMP-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-NEXT:    sw t3, 84(a5)
+; RV64IZCMP-NEXT:    lw t2, 84(a5)
+; RV64IZCMP-NEXT:    lw t1, 88(a5)
+; RV64IZCMP-NEXT:    lw t0, 92(a5)
+; RV64IZCMP-NEXT:    lw a7, 96(a5)
+; RV64IZCMP-NEXT:    lw s0, 100(a5)
+; RV64IZCMP-NEXT:    lw a6, 104(a5)
+; RV64IZCMP-NEXT:    lw a3, 108(a5)
+; RV64IZCMP-NEXT:    lw a2, 112(a5)
+; RV64IZCMP-NEXT:    lw a1, 116(a5)
+; RV64IZCMP-NEXT:    lw a0, 120(a5)
+; RV64IZCMP-NEXT:    lw t3, 124(a5)
+; RV64IZCMP-NEXT:    sw t3, 124(a5)
+; RV64IZCMP-NEXT:    sw a0, 120(a5)
+; RV64IZCMP-NEXT:    sw a1, 116(a5)
+; RV64IZCMP-NEXT:    sw a2, 112(a5)
+; RV64IZCMP-NEXT:    sw a3, 108(a5)
+; RV64IZCMP-NEXT:    sw a6, 104(a5)
+; RV64IZCMP-NEXT:    sw s0, 100(a5)
+; RV64IZCMP-NEXT:    sw a7, 96(a5)
+; RV64IZCMP-NEXT:    sw t0, 92(a5)
+; RV64IZCMP-NEXT:    sw t1, 88(a5)
+; RV64IZCMP-NEXT:    sw t2, 84(a5)
 ; RV64IZCMP-NEXT:    sw s1, 80(a5)
 ; RV64IZCMP-NEXT:    sw ra, 76(a5)
 ; RV64IZCMP-NEXT:    sw s11, 72(a5)
@@ -2570,13 +2570,13 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV64IZCMP-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
 ; RV64IZCMP-NEXT:    sw a0, 16(a5)
 ; RV64IZCMP-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(t0)
+; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(a4)
 ; RV64IZCMP-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(t0)
+; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(a4)
 ; RV64IZCMP-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(t0)
+; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(a4)
 ; RV64IZCMP-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq)(t0)
+; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV64IZCMP-NEXT:    ld t0, 160(sp) # 8-byte Folded Reload
 ; RV64IZCMP-NEXT:    ld t1, 152(sp) # 8-byte Folded Reload
 ; RV64IZCMP-NEXT:    ld t2, 144(sp) # 8-byte Folded Reload
@@ -2675,16 +2675,16 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV32IZCMP-SR-NEXT:    .cfi_offset t4, -104
 ; RV32IZCMP-SR-NEXT:    .cfi_offset t5, -108
 ; RV32IZCMP-SR-NEXT:    .cfi_offset t6, -112
-; RV32IZCMP-SR-NEXT:    lui t0, %hi(var_test_irq)
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(t0)
+; RV32IZCMP-SR-NEXT:    lui a4, %hi(var_test_irq)
+; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV32IZCMP-SR-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(t0)
+; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(a4)
 ; RV32IZCMP-SR-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(t0)
+; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(a4)
 ; RV32IZCMP-SR-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(t0)
+; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(a4)
 ; RV32IZCMP-SR-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    addi a5, t0, %lo(var_test_irq)
+; RV32IZCMP-SR-NEXT:    addi a5, a4, %lo(var_test_irq)
 ; RV32IZCMP-SR-NEXT:    lw a0, 16(a5)
 ; RV32IZCMP-SR-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
 ; RV32IZCMP-SR-NEXT:    lw a0, 20(a5)
@@ -2704,28 +2704,28 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV32IZCMP-SR-NEXT:    lw s11, 72(a5)
 ; RV32IZCMP-SR-NEXT:    lw ra, 76(a5)
 ; RV32IZCMP-SR-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-SR-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-SR-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-SR-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-SR-NEXT:    lw a7, 112(a5)
-; RV32IZCMP-SR-NEXT:    lw s0, 116(a5)
-; RV32IZCMP-SR-NEXT:    lw a3, 120(a5)
-; RV32IZCMP-SR-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-SR-NEXT:    lw a6, 96(a5)
-; RV32IZCMP-SR-NEXT:    lw a4, 100(a5)
-; RV32IZCMP-SR-NEXT:    lw a2, 104(a5)
-; RV32IZCMP-SR-NEXT:    lw a1, 108(a5)
-; RV32IZCMP-SR-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-SR-NEXT:    sw a3, 120(a5)
-; RV32IZCMP-SR-NEXT:    sw s0, 116(a5)
-; RV32IZCMP-SR-NEXT:    sw a7, 112(a5)
-; RV32IZCMP-SR-NEXT:    sw a1, 108(a5)
-; RV32IZCMP-SR-NEXT:    sw a2, 104(a5)
-; RV32IZCMP-SR-NEXT:    sw a4, 100(a5)
-; RV32IZCMP-SR-NEXT:    sw a6, 96(a5)
-; RV32IZCMP-SR-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-SR-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-SR-NEXT:    sw t3, 84(a5)
+; RV32IZCMP-SR-NEXT:    lw t2, 84(a5)
+; RV32IZCMP-SR-NEXT:    lw t1, 88(a5)
+; RV32IZCMP-SR-NEXT:    lw t0, 92(a5)
+; RV32IZCMP-SR-NEXT:    lw a7, 96(a5)
+; RV32IZCMP-SR-NEXT:    lw s0, 100(a5)
+; RV32IZCMP-SR-NEXT:    lw a6, 104(a5)
+; RV32IZCMP-SR-NEXT:    lw a3, 108(a5)
+; RV32IZCMP-SR-NEXT:    lw a2, 112(a5)
+; RV32IZCMP-SR-NEXT:    lw a1, 116(a5)
+; RV32IZCMP-SR-NEXT:    lw a0, 120(a5)
+; RV32IZCMP-SR-NEXT:    lw t3, 124(a5)
+; RV32IZCMP-SR-NEXT:    sw t3, 124(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 120(a5)
+; RV32IZCMP-SR-NEXT:    sw a1, 116(a5)
+; RV32IZCMP-SR-NEXT:    sw a2, 112(a5)
+; RV32IZCMP-SR-NEXT:    sw a3, 108(a5)
+; RV32IZCMP-SR-NEXT:    sw a6, 104(a5)
+; RV32IZCMP-SR-NEXT:    sw s0, 100(a5)
+; RV32IZCMP-SR-NEXT:    sw a7, 96(a5)
+; RV32IZCMP-SR-NEXT:    sw t0, 92(a5)
+; RV32IZCMP-SR-NEXT:    sw t1, 88(a5)
+; RV32IZCMP-SR-NEXT:    sw t2, 84(a5)
 ; RV32IZCMP-SR-NEXT:    sw s1, 80(a5)
 ; RV32IZCMP-SR-NEXT:    sw ra, 76(a5)
 ; RV32IZCMP-SR-NEXT:    sw s11, 72(a5)
@@ -2746,13 +2746,13 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV32IZCMP-SR-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32IZCMP-SR-NEXT:    sw a0, 16(a5)
 ; RV32IZCMP-SR-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(t0)
+; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(a4)
 ; RV32IZCMP-SR-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(t0)
+; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(a4)
 ; RV32IZCMP-SR-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(t0)
+; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(a4)
 ; RV32IZCMP-SR-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(t0)
+; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV32IZCMP-SR-NEXT:    lw t0, 88(sp) # 4-byte Folded Reload
 ; RV32IZCMP-SR-NEXT:    lw t1, 84(sp) # 4-byte Folded Reload
 ; RV32IZCMP-SR-NEXT:    lw t2, 80(sp) # 4-byte Folded Reload
@@ -2851,16 +2851,16 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV64IZCMP-SR-NEXT:    .cfi_offset t4, -208
 ; RV64IZCMP-SR-NEXT:    .cfi_offset t5, -216
 ; RV64IZCMP-SR-NEXT:    .cfi_offset t6, -224
-; RV64IZCMP-SR-NEXT:    lui t0, %hi(var_test_irq)
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(t0)
+; RV64IZCMP-SR-NEXT:    lui a4, %hi(var_test_irq)
+; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV64IZCMP-SR-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(t0)
+; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(a4)
 ; RV64IZCMP-SR-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(t0)
+; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(a4)
 ; RV64IZCMP-SR-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(t0)
+; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(a4)
 ; RV64IZCMP-SR-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    addi a5, t0, %lo(var_test_irq)
+; RV64IZCMP-SR-NEXT:    addi a5, a4, %lo(var_test_irq)
 ; RV64IZCMP-SR-NEXT:    lw a0, 16(a5)
 ; RV64IZCMP-SR-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
 ; RV64IZCMP-SR-NEXT:    lw a0, 20(a5)
@@ -2880,28 +2880,28 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV64IZCMP-SR-NEXT:    lw s11, 72(a5)
 ; RV64IZCMP-SR-NEXT:    lw ra, 76(a5)
 ; RV64IZCMP-SR-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-SR-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-SR-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-SR-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-SR-NEXT:    lw a7, 112(a5)
-; RV64IZCMP-SR-NEXT:    lw s0, 116(a5)
-; RV64IZCMP-SR-NEXT:    lw a3, 120(a5)
-; RV64IZCMP-SR-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-SR-NEXT:    lw a6, 96(a5)
-; RV64IZCMP-SR-NEXT:    lw a4, 100(a5)
-; RV64IZCMP-SR-NEXT:    lw a2, 104(a5)
-; RV64IZCMP-SR-NEXT:    lw a1, 108(a5)
-; RV64IZCMP-SR-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-SR-NEXT:    sw a3, 120(a5)
-; RV64IZCMP-SR-NEXT:    sw s0, 116(a5)
-; RV64IZCMP-SR-NEXT:    sw a7, 112(a5)
-; RV64IZCMP-SR-NEXT:    sw a1, 108(a5)
-; RV64IZCMP-SR-NEXT:    sw a2, 104(a5)
-; RV64IZCMP-SR-NEXT:    sw a4, 100(a5)
-; RV64IZCMP-SR-NEXT:    sw a6, 96(a5)
-; RV64IZCMP-SR-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-SR-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-SR-NEXT:    sw t3, 84(a5)
+; RV64IZCMP-SR-NEXT:    lw t2, 84(a5)
+; RV64IZCMP-SR-NEXT:    lw t1, 88(a5)
+; RV64IZCMP-SR-NEXT:    lw t0, 92(a5)
+; RV64IZCMP-SR-NEXT:    lw a7, 96(a5)
+; RV64IZCMP-SR-NEXT:    lw s0, 100(a5)
+; RV64IZCMP-SR-NEXT:    lw a6, 104(a5)
+; RV64IZCMP-SR-NEXT:    lw a3, 108(a5)
+; RV64IZCMP-SR-NEXT:    lw a2, 112(a5)
+; RV64IZCMP-SR-NEXT:    lw a1, 116(a5)
+; RV64IZCMP-SR-NEXT:    lw a0, 120(a5)
+; RV64IZCMP-SR-NEXT:    lw t3, 124(a5)
+; RV64IZCMP-SR-NEXT:    sw t3, 124(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 120(a5)
+; RV64IZCMP-SR-NEXT:    sw a1, 116(a5)
+; RV64IZCMP-SR-NEXT:    sw a2, 112(a5)
+; RV64IZCMP-SR-NEXT:    sw a3, 108(a5)
+; RV64IZCMP-SR-NEXT:    sw a6, 104(a5)
+; RV64IZCMP-SR-NEXT:    sw s0, 100(a5)
+; RV64IZCMP-SR-NEXT:    sw a7, 96(a5)
+; RV64IZCMP-SR-NEXT:    sw t0, 92(a5)
+; RV64IZCMP-SR-NEXT:    sw t1, 88(a5)
+; RV64IZCMP-SR-NEXT:    sw t2, 84(a5)
 ; RV64IZCMP-SR-NEXT:    sw s1, 80(a5)
 ; RV64IZCMP-SR-NEXT:    sw ra, 76(a5)
 ; RV64IZCMP-SR-NEXT:    sw s11, 72(a5)
@@ -2922,13 +2922,13 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV64IZCMP-SR-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
 ; RV64IZCMP-SR-NEXT:    sw a0, 16(a5)
 ; RV64IZCMP-SR-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(t0)
+; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(a4)
 ; RV64IZCMP-SR-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(t0)
+; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(a4)
 ; RV64IZCMP-SR-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(t0)
+; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(a4)
 ; RV64IZCMP-SR-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(t0)
+; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV64IZCMP-SR-NEXT:    ld t0, 160(sp) # 8-byte Folded Reload
 ; RV64IZCMP-SR-NEXT:    ld t1, 152(sp) # 8-byte Folded Reload
 ; RV64IZCMP-SR-NEXT:    ld t2, 144(sp) # 8-byte Folded Reload
@@ -3038,16 +3038,16 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV32I-NEXT:    .cfi_offset t4, -104
 ; RV32I-NEXT:    .cfi_offset t5, -108
 ; RV32I-NEXT:    .cfi_offset t6, -112
-; RV32I-NEXT:    lui a7, %hi(var_test_irq)
-; RV32I-NEXT:    lw a0, %lo(var_test_irq)(a7)
+; RV32I-NEXT:    lui a4, %hi(var_test_irq)
+; RV32I-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV32I-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+4)(a7)
+; RV32I-NEXT:    lw a0, %lo(var_test_irq+4)(a4)
 ; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+8)(a7)
+; RV32I-NEXT:    lw a0, %lo(var_test_irq+8)(a4)
 ; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+12)(a7)
+; RV32I-NEXT:    lw a0, %lo(var_test_irq+12)(a4)
 ; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    addi a5, a7, %lo(var_test_irq)
+; RV32I-NEXT:    addi a5, a4, %lo(var_test_irq)
 ; RV32I-NEXT:    lw a0, 16(a5)
 ; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lw a0, 20(a5)
@@ -3070,22 +3070,22 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV32I-NEXT:    lw s8, 84(a5)
 ; RV32I-NEXT:    lw s9, 88(a5)
 ; RV32I-NEXT:    lw s10, 92(a5)
-; RV32I-NEXT:    lw s11, 112(a5)
-; RV32I-NEXT:    lw ra, 116(a5)
-; RV32I-NEXT:    lw a3, 120(a5)
-; RV32I-NEXT:    lw a0, 124(a5)
-; RV32I-NEXT:    lw a6, 96(a5)
-; RV32I-NEXT:    lw a4, 100(a5)
-; RV32I-NEXT:    lw a2, 104(a5)
-; RV32I-NEXT:    lw a1, 108(a5)
-; RV32I-NEXT:    sw a0, 124(a5)
-; RV32I-NEXT:    sw a3, 120(a5)
-; RV32I-NEXT:    sw ra, 116(a5)
-; RV32I-NEXT:    sw s11, 112(a5)
-; RV32I-NEXT:    sw a1, 108(a5)
-; RV32I-NEXT:    sw a2, 104(a5)
-; RV32I-NEXT:    sw a4, 100(a5)
-; RV32I-NEXT:    sw a6, 96(a5)
+; RV32I-NEXT:    lw s11, 96(a5)
+; RV32I-NEXT:    lw ra, 100(a5)
+; RV32I-NEXT:    lw a6, 104(a5)
+; RV32I-NEXT:    lw a3, 108(a5)
+; RV32I-NEXT:    lw a2, 112(a5)
+; RV32I-NEXT:    lw a1, 116(a5)
+; RV32I-NEXT:    lw a0, 120(a5)
+; RV32I-NEXT:    lw a7, 124(a5)
+; RV32I-NEXT:    sw a7, 124(a5)
+; RV32I-NEXT:    sw a0, 120(a5)
+; RV32I-NEXT:    sw a1, 116(a5)
+; RV32I-NEXT:    sw a2, 112(a5)
+; RV32I-NEXT:    sw a3, 108(a5)
+; RV32I-NEXT:    sw a6, 104(a5)
+; RV32I-NEXT:    sw ra, 100(a5)
+; RV32I-NEXT:    sw s11, 96(a5)
 ; RV32I-NEXT:    sw s10, 92(a5)
 ; RV32I-NEXT:    sw s9, 88(a5)
 ; RV32I-NEXT:    sw s8, 84(a5)
@@ -3109,13 +3109,13 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sw a0, 16(a5)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+12)(a7)
+; RV32I-NEXT:    sw a0, %lo(var_test_irq+12)(a4)
 ; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+8)(a7)
+; RV32I-NEXT:    sw a0, %lo(var_test_irq+8)(a4)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+4)(a7)
+; RV32I-NEXT:    sw a0, %lo(var_test_irq+4)(a4)
 ; RV32I-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq)(a7)
+; RV32I-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw t0, 136(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw t1, 132(sp) # 4-byte Folded Reload
@@ -3236,16 +3236,16 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV64I-NEXT:    .cfi_offset t4, -208
 ; RV64I-NEXT:    .cfi_offset t5, -216
 ; RV64I-NEXT:    .cfi_offset t6, -224
-; RV64I-NEXT:    lui a7, %hi(var_test_irq)
-; RV64I-NEXT:    lw a0, %lo(var_test_irq)(a7)
+; RV64I-NEXT:    lui a4, %hi(var_test_irq)
+; RV64I-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+4)(a7)
+; RV64I-NEXT:    lw a0, %lo(var_test_irq+4)(a4)
 ; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+8)(a7)
+; RV64I-NEXT:    lw a0, %lo(var_test_irq+8)(a4)
 ; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+12)(a7)
+; RV64I-NEXT:    lw a0, %lo(var_test_irq+12)(a4)
 ; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    addi a5, a7, %lo(var_test_irq)
+; RV64I-NEXT:    addi a5, a4, %lo(var_test_irq)
 ; RV64I-NEXT:    lw a0, 16(a5)
 ; RV64I-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lw a0, 20(a5)
@@ -3268,22 +3268,22 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV64I-NEXT:    lw s8, 84(a5)
 ; RV64I-NEXT:    lw s9, 88(a5)
 ; RV64I-NEXT:    lw s10, 92(a5)
-; RV64I-NEXT:    lw s11, 112(a5)
-; RV64I-NEXT:    lw ra, 116(a5)
-; RV64I-NEXT:    lw a3, 120(a5)
-; RV64I-NEXT:    lw a0, 124(a5)
-; RV64I-NEXT:    lw a6, 96(a5)
-; RV64I-NEXT:    lw a4, 100(a5)
-; RV64I-NEXT:    lw a2, 104(a5)
-; RV64I-NEXT:    lw a1, 108(a5)
-; RV64I-NEXT:    sw a0, 124(a5)
-; RV64I-NEXT:    sw a3, 120(a5)
-; RV64I-NEXT:    sw ra, 116(a5)
-; RV64I-NEXT:    sw s11, 112(a5)
-; RV64I-NEXT:    sw a1, 108(a5)
-; RV64I-NEXT:    sw a2, 104(a5)
-; RV64I-NEXT:    sw a4, 100(a5)
-; RV64I-NEXT:    sw a6, 96(a5)
+; RV64I-NEXT:    lw s11, 96(a5)
+; RV64I-NEXT:    lw ra, 100(a5)
+; RV64I-NEXT:    lw a6, 104(a5)
+; RV64I-NEXT:    lw a3, 108(a5)
+; RV64I-NEXT:    lw a2, 112(a5)
+; RV64I-NEXT:    lw a1, 116(a5)
+; RV64I-NEXT:    lw a0, 120(a5)
+; RV64I-NEXT:    lw a7, 124(a5)
+; RV64I-NEXT:    sw a7, 124(a5)
+; RV64I-NEXT:    sw a0, 120(a5)
+; RV64I-NEXT:    sw a1, 116(a5)
+; RV64I-NEXT:    sw a2, 112(a5)
+; RV64I-NEXT:    sw a3, 108(a5)
+; RV64I-NEXT:    sw a6, 104(a5)
+; RV64I-NEXT:    sw ra, 100(a5)
+; RV64I-NEXT:    sw s11, 96(a5)
 ; RV64I-NEXT:    sw s10, 92(a5)
 ; RV64I-NEXT:    sw s9, 88(a5)
 ; RV64I-NEXT:    sw s8, 84(a5)
@@ -3307,13 +3307,13 @@ define void @callee_with_irq() "interrupt"="user" {
 ; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sw a0, 16(a5)
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+12)(a7)
+; RV64I-NEXT:    sw a0, %lo(var_test_irq+12)(a4)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+8)(a7)
+; RV64I-NEXT:    sw a0, %lo(var_test_irq+8)(a4)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+4)(a7)
+; RV64I-NEXT:    sw a0, %lo(var_test_irq+4)(a4)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq)(a7)
+; RV64I-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV64I-NEXT:    ld ra, 264(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld t0, 256(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld t1, 248(sp) # 8-byte Folded Reload
@@ -3396,16 +3396,16 @@ define void @callee_no_irq() {
 ; RV32IZCMP-NEXT:    .cfi_offset s9, -12
 ; RV32IZCMP-NEXT:    .cfi_offset s10, -8
 ; RV32IZCMP-NEXT:    .cfi_offset s11, -4
-; RV32IZCMP-NEXT:    lui t0, %hi(var_test_irq)
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq)(t0)
+; RV32IZCMP-NEXT:    lui a4, %hi(var_test_irq)
+; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV32IZCMP-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(t0)
+; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(a4)
 ; RV32IZCMP-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(t0)
+; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(a4)
 ; RV32IZCMP-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(t0)
+; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(a4)
 ; RV32IZCMP-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    addi a5, t0, %lo(var_test_irq)
+; RV32IZCMP-NEXT:    addi a5, a4, %lo(var_test_irq)
 ; RV32IZCMP-NEXT:    lw a0, 16(a5)
 ; RV32IZCMP-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
 ; RV32IZCMP-NEXT:    lw a0, 20(a5)
@@ -3425,28 +3425,28 @@ define void @callee_no_irq() {
 ; RV32IZCMP-NEXT:    lw s11, 72(a5)
 ; RV32IZCMP-NEXT:    lw ra, 76(a5)
 ; RV32IZCMP-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-NEXT:    lw a7, 112(a5)
-; RV32IZCMP-NEXT:    lw s0, 116(a5)
-; RV32IZCMP-NEXT:    lw a3, 120(a5)
-; RV32IZCMP-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-NEXT:    lw a6, 96(a5)
-; RV32IZCMP-NEXT:    lw a4, 100(a5)
-; RV32IZCMP-NEXT:    lw a2, 104(a5)
-; RV32IZCMP-NEXT:    lw a1, 108(a5)
-; RV32IZCMP-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-NEXT:    sw a3, 120(a5)
-; RV32IZCMP-NEXT:    sw s0, 116(a5)
-; RV32IZCMP-NEXT:    sw a7, 112(a5)
-; RV32IZCMP-NEXT:    sw a1, 108(a5)
-; RV32IZCMP-NEXT:    sw a2, 104(a5)
-; RV32IZCMP-NEXT:    sw a4, 100(a5)
-; RV32IZCMP-NEXT:    sw a6, 96(a5)
-; RV32IZCMP-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-NEXT:    sw t3, 84(a5)
+; RV32IZCMP-NEXT:    lw t2, 84(a5)
+; RV32IZCMP-NEXT:    lw t1, 88(a5)
+; RV32IZCMP-NEXT:    lw t0, 92(a5)
+; RV32IZCMP-NEXT:    lw a7, 96(a5)
+; RV32IZCMP-NEXT:    lw s0, 100(a5)
+; RV32IZCMP-NEXT:    lw a6, 104(a5)
+; RV32IZCMP-NEXT:    lw a3, 108(a5)
+; RV32IZCMP-NEXT:    lw a2, 112(a5)
+; RV32IZCMP-NEXT:    lw a1, 116(a5)
+; RV32IZCMP-NEXT:    lw a0, 120(a5)
+; RV32IZCMP-NEXT:    lw t3, 124(a5)
+; RV32IZCMP-NEXT:    sw t3, 124(a5)
+; RV32IZCMP-NEXT:    sw a0, 120(a5)
+; RV32IZCMP-NEXT:    sw a1, 116(a5)
+; RV32IZCMP-NEXT:    sw a2, 112(a5)
+; RV32IZCMP-NEXT:    sw a3, 108(a5)
+; RV32IZCMP-NEXT:    sw a6, 104(a5)
+; RV32IZCMP-NEXT:    sw s0, 100(a5)
+; RV32IZCMP-NEXT:    sw a7, 96(a5)
+; RV32IZCMP-NEXT:    sw t0, 92(a5)
+; RV32IZCMP-NEXT:    sw t1, 88(a5)
+; RV32IZCMP-NEXT:    sw t2, 84(a5)
 ; RV32IZCMP-NEXT:    sw s1, 80(a5)
 ; RV32IZCMP-NEXT:    sw ra, 76(a5)
 ; RV32IZCMP-NEXT:    sw s11, 72(a5)
@@ -3467,13 +3467,13 @@ define void @callee_no_irq() {
 ; RV32IZCMP-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32IZCMP-NEXT:    sw a0, 16(a5)
 ; RV32IZCMP-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(t0)
+; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(a4)
 ; RV32IZCMP-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(t0)
+; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(a4)
 ; RV32IZCMP-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(t0)
+; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(a4)
 ; RV32IZCMP-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq)(t0)
+; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV32IZCMP-NEXT:    cm.popret {ra, s0-s11}, 80
 ;
 ; RV64IZCMP-LABEL: callee_no_irq:
@@ -3493,16 +3493,16 @@ define void @callee_no_irq() {
 ; RV64IZCMP-NEXT:    .cfi_offset s9, -24
 ; RV64IZCMP-NEXT:    .cfi_offset s10, -16
 ; RV64IZCMP-NEXT:    .cfi_offset s11, -8
-; RV64IZCMP-NEXT:    lui t0, %hi(var_test_irq)
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq)(t0)
+; RV64IZCMP-NEXT:    lui a4, %hi(var_test_irq)
+; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV64IZCMP-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(t0)
+; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(a4)
 ; RV64IZCMP-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(t0)
+; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(a4)
 ; RV64IZCMP-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(t0)
+; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(a4)
 ; RV64IZCMP-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    addi a5, t0, %lo(var_test_irq)
+; RV64IZCMP-NEXT:    addi a5, a4, %lo(var_test_irq)
 ; RV64IZCMP-NEXT:    lw a0, 16(a5)
 ; RV64IZCMP-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
 ; RV64IZCMP-NEXT:    lw a0, 20(a5)
@@ -3522,28 +3522,28 @@ define void @callee_no_irq() {
 ; RV64IZCMP-NEXT:    lw s11, 72(a5)
 ; RV64IZCMP-NEXT:    lw ra, 76(a5)
 ; RV64IZCMP-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-NEXT:    lw a7, 112(a5)
-; RV64IZCMP-NEXT:    lw s0, 116(a5)
-; RV64IZCMP-NEXT:    lw a3, 120(a5)
-; RV64IZCMP-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-NEXT:    lw a6, 96(a5)
-; RV64IZCMP-NEXT:    lw a4, 100(a5)
-; RV64IZCMP-NEXT:    lw a2, 104(a5)
-; RV64IZCMP-NEXT:    lw a1, 108(a5)
-; RV64IZCMP-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-NEXT:    sw a3, 120(a5)
-; RV64IZCMP-NEXT:    sw s0, 116(a5)
-; RV64IZCMP-NEXT:    sw a7, 112(a5)
-; RV64IZCMP-NEXT:    sw a1, 108(a5)
-; RV64IZCMP-NEXT:    sw a2, 104(a5)
-; RV64IZCMP-NEXT:    sw a4, 100(a5)
-; RV64IZCMP-NEXT:    sw a6, 96(a5)
-; RV64IZCMP-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-NEXT:    sw t3, 84(a5)
+; RV64IZCMP-NEXT:    lw t2, 84(a5)
+; RV64IZCMP-NEXT:    lw t1, 88(a5)
+; RV64IZCMP-NEXT:    lw t0, 92(a5)
+; RV64IZCMP-NEXT:    lw a7, 96(a5)
+; RV64IZCMP-NEXT:    lw s0, 100(a5)
+; RV64IZCMP-NEXT:    lw a6, 104(a5)
+; RV64IZCMP-NEXT:    lw a3, 108(a5)
+; RV64IZCMP-NEXT:    lw a2, 112(a5)
+; RV64IZCMP-NEXT:    lw a1, 116(a5)
+; RV64IZCMP-NEXT:    lw a0, 120(a5)
+; RV64IZCMP-NEXT:    lw t3, 124(a5)
+; RV64IZCMP-NEXT:    sw t3, 124(a5)
+; RV64IZCMP-NEXT:    sw a0, 120(a5)
+; RV64IZCMP-NEXT:    sw a1, 116(a5)
+; RV64IZCMP-NEXT:    sw a2, 112(a5)
+; RV64IZCMP-NEXT:    sw a3, 108(a5)
+; RV64IZCMP-NEXT:    sw a6, 104(a5)
+; RV64IZCMP-NEXT:    sw s0, 100(a5)
+; RV64IZCMP-NEXT:    sw a7, 96(a5)
+; RV64IZCMP-NEXT:    sw t0, 92(a5)
+; RV64IZCMP-NEXT:    sw t1, 88(a5)
+; RV64IZCMP-NEXT:    sw t2, 84(a5)
 ; RV64IZCMP-NEXT:    sw s1, 80(a5)
 ; RV64IZCMP-NEXT:    sw ra, 76(a5)
 ; RV64IZCMP-NEXT:    sw s11, 72(a5)
@@ -3564,13 +3564,13 @@ define void @callee_no_irq() {
 ; RV64IZCMP-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
 ; RV64IZCMP-NEXT:    sw a0, 16(a5)
 ; RV64IZCMP-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(t0)
+; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(a4)
 ; RV64IZCMP-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(t0)
+; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(a4)
 ; RV64IZCMP-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(t0)
+; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(a4)
 ; RV64IZCMP-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq)(t0)
+; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV64IZCMP-NEXT:    cm.popret {ra, s0-s11}, 160
 ;
 ; RV32IZCMP-SR-LABEL: callee_no_irq:
@@ -3590,16 +3590,16 @@ define void @callee_no_irq() {
 ; RV32IZCMP-SR-NEXT:    .cfi_offset s9, -12
 ; RV32IZCMP-SR-NEXT:    .cfi_offset s10, -8
 ; RV32IZCMP-SR-NEXT:    .cfi_offset s11, -4
-; RV32IZCMP-SR-NEXT:    lui t0, %hi(var_test_irq)
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(t0)
+; RV32IZCMP-SR-NEXT:    lui a4, %hi(var_test_irq)
+; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV32IZCMP-SR-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(t0)
+; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(a4)
 ; RV32IZCMP-SR-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(t0)
+; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(a4)
 ; RV32IZCMP-SR-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(t0)
+; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(a4)
 ; RV32IZCMP-SR-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    addi a5, t0, %lo(var_test_irq)
+; RV32IZCMP-SR-NEXT:    addi a5, a4, %lo(var_test_irq)
 ; RV32IZCMP-SR-NEXT:    lw a0, 16(a5)
 ; RV32IZCMP-SR-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
 ; RV32IZCMP-SR-NEXT:    lw a0, 20(a5)
@@ -3619,28 +3619,28 @@ define void @callee_no_irq() {
 ; RV32IZCMP-SR-NEXT:    lw s11, 72(a5)
 ; RV32IZCMP-SR-NEXT:    lw ra, 76(a5)
 ; RV32IZCMP-SR-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-SR-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-SR-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-SR-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-SR-NEXT:    lw a7, 112(a5)
-; RV32IZCMP-SR-NEXT:    lw s0, 116(a5)
-; RV32IZCMP-SR-NEXT:    lw a3, 120(a5)
-; RV32IZCMP-SR-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-SR-NEXT:    lw a6, 96(a5)
-; RV32IZCMP-SR-NEXT:    lw a4, 100(a5)
-; RV32IZCMP-SR-NEXT:    lw a2, 104(a5)
-; RV32IZCMP-SR-NEXT:    lw a1, 108(a5)
-; RV32IZCMP-SR-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-SR-NEXT:    sw a3, 120(a5)
-; RV32IZCMP-SR-NEXT:    sw s0, 116(a5)
-; RV32IZCMP-SR-NEXT:    sw a7, 112(a5)
-; RV32IZCMP-SR-NEXT:    sw a1, 108(a5)
-; RV32IZCMP-SR-NEXT:    sw a2, 104(a5)
-; RV32IZCMP-SR-NEXT:    sw a4, 100(a5)
-; RV32IZCMP-SR-NEXT:    sw a6, 96(a5)
-; RV32IZCMP-SR-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-SR-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-SR-NEXT:    sw t3, 84(a5)
+; RV32IZCMP-SR-NEXT:    lw t2, 84(a5)
+; RV32IZCMP-SR-NEXT:    lw t1, 88(a5)
+; RV32IZCMP-SR-NEXT:    lw t0, 92(a5)
+; RV32IZCMP-SR-NEXT:    lw a7, 96(a5)
+; RV32IZCMP-SR-NEXT:    lw s0, 100(a5)
+; RV32IZCMP-SR-NEXT:    lw a6, 104(a5)
+; RV32IZCMP-SR-NEXT:    lw a3, 108(a5)
+; RV32IZCMP-SR-NEXT:    lw a2, 112(a5)
+; RV32IZCMP-SR-NEXT:    lw a1, 116(a5)
+; RV32IZCMP-SR-NEXT:    lw a0, 120(a5)
+; RV32IZCMP-SR-NEXT:    lw t3, 124(a5)
+; RV32IZCMP-SR-NEXT:    sw t3, 124(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 120(a5)
+; RV32IZCMP-SR-NEXT:    sw a1, 116(a5)
+; RV32IZCMP-SR-NEXT:    sw a2, 112(a5)
+; RV32IZCMP-SR-NEXT:    sw a3, 108(a5)
+; RV32IZCMP-SR-NEXT:    sw a6, 104(a5)
+; RV32IZCMP-SR-NEXT:    sw s0, 100(a5)
+; RV32IZCMP-SR-NEXT:    sw a7, 96(a5)
+; RV32IZCMP-SR-NEXT:    sw t0, 92(a5)
+; RV32IZCMP-SR-NEXT:    sw t1, 88(a5)
+; RV32IZCMP-SR-NEXT:    sw t2, 84(a5)
 ; RV32IZCMP-SR-NEXT:    sw s1, 80(a5)
 ; RV32IZCMP-SR-NEXT:    sw ra, 76(a5)
 ; RV32IZCMP-SR-NEXT:    sw s11, 72(a5)
@@ -3661,13 +3661,13 @@ define void @callee_no_irq() {
 ; RV32IZCMP-SR-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32IZCMP-SR-NEXT:    sw a0, 16(a5)
 ; RV32IZCMP-SR-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(t0)
+; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(a4)
 ; RV32IZCMP-SR-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(t0)
+; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(a4)
 ; RV32IZCMP-SR-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(t0)
+; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(a4)
 ; RV32IZCMP-SR-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(t0)
+; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV32IZCMP-SR-NEXT:    cm.popret {ra, s0-s11}, 80
 ;
 ; RV64IZCMP-SR-LABEL: callee_no_irq:
@@ -3687,16 +3687,16 @@ define void @callee_no_irq() {
 ; RV64IZCMP-SR-NEXT:    .cfi_offset s9, -24
 ; RV64IZCMP-SR-NEXT:    .cfi_offset s10, -16
 ; RV64IZCMP-SR-NEXT:    .cfi_offset s11, -8
-; RV64IZCMP-SR-NEXT:    lui t0, %hi(var_test_irq)
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(t0)
+; RV64IZCMP-SR-NEXT:    lui a4, %hi(var_test_irq)
+; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV64IZCMP-SR-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(t0)
+; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(a4)
 ; RV64IZCMP-SR-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(t0)
+; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(a4)
 ; RV64IZCMP-SR-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(t0)
+; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(a4)
 ; RV64IZCMP-SR-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    addi a5, t0, %lo(var_test_irq)
+; RV64IZCMP-SR-NEXT:    addi a5, a4, %lo(var_test_irq)
 ; RV64IZCMP-SR-NEXT:    lw a0, 16(a5)
 ; RV64IZCMP-SR-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
 ; RV64IZCMP-SR-NEXT:    lw a0, 20(a5)
@@ -3716,28 +3716,28 @@ define void @callee_no_irq() {
 ; RV64IZCMP-SR-NEXT:    lw s11, 72(a5)
 ; RV64IZCMP-SR-NEXT:    lw ra, 76(a5)
 ; RV64IZCMP-SR-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-SR-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-SR-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-SR-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-SR-NEXT:    lw a7, 112(a5)
-; RV64IZCMP-SR-NEXT:    lw s0, 116(a5)
-; RV64IZCMP-SR-NEXT:    lw a3, 120(a5)
-; RV64IZCMP-SR-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-SR-NEXT:    lw a6, 96(a5)
-; RV64IZCMP-SR-NEXT:    lw a4, 100(a5)
-; RV64IZCMP-SR-NEXT:    lw a2, 104(a5)
-; RV64IZCMP-SR-NEXT:    lw a1, 108(a5)
-; RV64IZCMP-SR-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-SR-NEXT:    sw a3, 120(a5)
-; RV64IZCMP-SR-NEXT:    sw s0, 116(a5)
-; RV64IZCMP-SR-NEXT:    sw a7, 112(a5)
-; RV64IZCMP-SR-NEXT:    sw a1, 108(a5)
-; RV64IZCMP-SR-NEXT:    sw a2, 104(a5)
-; RV64IZCMP-SR-NEXT:    sw a4, 100(a5)
-; RV64IZCMP-SR-NEXT:    sw a6, 96(a5)
-; RV64IZCMP-SR-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-SR-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-SR-NEXT:    sw t3, 84(a5)
+; RV64IZCMP-SR-NEXT:    lw t2, 84(a5)
+; RV64IZCMP-SR-NEXT:    lw t1, 88(a5)
+; RV64IZCMP-SR-NEXT:    lw t0, 92(a5)
+; RV64IZCMP-SR-NEXT:    lw a7, 96(a5)
+; RV64IZCMP-SR-NEXT:    lw s0, 100(a5)
+; RV64IZCMP-SR-NEXT:    lw a6, 104(a5)
+; RV64IZCMP-SR-NEXT:    lw a3, 108(a5)
+; RV64IZCMP-SR-NEXT:    lw a2, 112(a5)
+; RV64IZCMP-SR-NEXT:    lw a1, 116(a5)
+; RV64IZCMP-SR-NEXT:    lw a0, 120(a5)
+; RV64IZCMP-SR-NEXT:    lw t3, 124(a5)
+; RV64IZCMP-SR-NEXT:    sw t3, 124(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 120(a5)
+; RV64IZCMP-SR-NEXT:    sw a1, 116(a5)
+; RV64IZCMP-SR-NEXT:    sw a2, 112(a5)
+; RV64IZCMP-SR-NEXT:    sw a3, 108(a5)
+; RV64IZCMP-SR-NEXT:    sw a6, 104(a5)
+; RV64IZCMP-SR-NEXT:    sw s0, 100(a5)
+; RV64IZCMP-SR-NEXT:    sw a7, 96(a5)
+; RV64IZCMP-SR-NEXT:    sw t0, 92(a5)
+; RV64IZCMP-SR-NEXT:    sw t1, 88(a5)
+; RV64IZCMP-SR-NEXT:    sw t2, 84(a5)
 ; RV64IZCMP-SR-NEXT:    sw s1, 80(a5)
 ; RV64IZCMP-SR-NEXT:    sw ra, 76(a5)
 ; RV64IZCMP-SR-NEXT:    sw s11, 72(a5)
@@ -3758,13 +3758,13 @@ define void @callee_no_irq() {
 ; RV64IZCMP-SR-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
 ; RV64IZCMP-SR-NEXT:    sw a0, 16(a5)
 ; RV64IZCMP-SR-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(t0)
+; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(a4)
 ; RV64IZCMP-SR-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(t0)
+; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(a4)
 ; RV64IZCMP-SR-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(t0)
+; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(a4)
 ; RV64IZCMP-SR-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(t0)
+; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV64IZCMP-SR-NEXT:    cm.popret {ra, s0-s11}, 160
 ;
 ; RV32I-LABEL: callee_no_irq:
@@ -3797,16 +3797,16 @@ define void @callee_no_irq() {
 ; RV32I-NEXT:    .cfi_offset s9, -44
 ; RV32I-NEXT:    .cfi_offset s10, -48
 ; RV32I-NEXT:    .cfi_offset s11, -52
-; RV32I-NEXT:    lui a7, %hi(var_test_irq)
-; RV32I-NEXT:    lw a0, %lo(var_test_irq)(a7)
+; RV32I-NEXT:    lui a4, %hi(var_test_irq)
+; RV32I-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+4)(a7)
+; RV32I-NEXT:    lw a0, %lo(var_test_irq+4)(a4)
 ; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+8)(a7)
+; RV32I-NEXT:    lw a0, %lo(var_test_irq+8)(a4)
 ; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+12)(a7)
+; RV32I-NEXT:    lw a0, %lo(var_test_irq+12)(a4)
 ; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    addi a5, a7, %lo(var_test_irq)
+; RV32I-NEXT:    addi a5, a4, %lo(var_test_irq)
 ; RV32I-NEXT:    lw a0, 16(a5)
 ; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lw a0, 20(a5)
@@ -3829,22 +3829,22 @@ define void @callee_no_irq() {
 ; RV32I-NEXT:    lw s8, 84(a5)
 ; RV32I-NEXT:    lw s9, 88(a5)
 ; RV32I-NEXT:    lw s10, 92(a5)
-; RV32I-NEXT:    lw s11, 112(a5)
-; RV32I-NEXT:    lw ra, 116(a5)
-; RV32I-NEXT:    lw a3, 120(a5)
-; RV32I-NEXT:    lw a0, 124(a5)
-; RV32I-NEXT:    lw a6, 96(a5)
-; RV32I-NEXT:    lw a4, 100(a5)
-; RV32I-NEXT:    lw a2, 104(a5)
-; RV32I-NEXT:    lw a1, 108(a5)
-; RV32I-NEXT:    sw a0, 124(a5)
-; RV32I-NEXT:    sw a3, 120(a5)
-; RV32I-NEXT:    sw ra, 116(a5)
-; RV32I-NEXT:    sw s11, 112(a5)
-; RV32I-NEXT:    sw a1, 108(a5)
-; RV32I-NEXT:    sw a2, 104(a5)
-; RV32I-NEXT:    sw a4, 100(a5)
-; RV32I-NEXT:    sw a6, 96(a5)
+; RV32I-NEXT:    lw s11, 96(a5)
+; RV32I-NEXT:    lw ra, 100(a5)
+; RV32I-NEXT:    lw a6, 104(a5)
+; RV32I-NEXT:    lw a3, 108(a5)
+; RV32I-NEXT:    lw a2, 112(a5)
+; RV32I-NEXT:    lw a1, 116(a5)
+; RV32I-NEXT:    lw a0, 120(a5)
+; RV32I-NEXT:    lw a7, 124(a5)
+; RV32I-NEXT:    sw a7, 124(a5)
+; RV32I-NEXT:    sw a0, 120(a5)
+; RV32I-NEXT:    sw a1, 116(a5)
+; RV32I-NEXT:    sw a2, 112(a5)
+; RV32I-NEXT:    sw a3, 108(a5)
+; RV32I-NEXT:    sw a6, 104(a5)
+; RV32I-NEXT:    sw ra, 100(a5)
+; RV32I-NEXT:    sw s11, 96(a5)
 ; RV32I-NEXT:    sw s10, 92(a5)
 ; RV32I-NEXT:    sw s9, 88(a5)
 ; RV32I-NEXT:    sw s8, 84(a5)
@@ -3868,13 +3868,13 @@ define void @callee_no_irq() {
 ; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sw a0, 16(a5)
 ; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+12)(a7)
+; RV32I-NEXT:    sw a0, %lo(var_test_irq+12)(a4)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+8)(a7)
+; RV32I-NEXT:    sw a0, %lo(var_test_irq+8)(a4)
 ; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+4)(a7)
+; RV32I-NEXT:    sw a0, %lo(var_test_irq+4)(a4)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq)(a7)
+; RV32I-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV32I-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
@@ -3935,16 +3935,16 @@ define void @callee_no_irq() {
 ; RV64I-NEXT:    .cfi_offset s9, -88
 ; RV64I-NEXT:    .cfi_offset s10, -96
 ; RV64I-NEXT:    .cfi_offset s11, -104
-; RV64I-NEXT:    lui a7, %hi(var_test_irq)
-; RV64I-NEXT:    lw a0, %lo(var_test_irq)(a7)
+; RV64I-NEXT:    lui a4, %hi(var_test_irq)
+; RV64I-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+4)(a7)
+; RV64I-NEXT:    lw a0, %lo(var_test_irq+4)(a4)
 ; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+8)(a7)
+; RV64I-NEXT:    lw a0, %lo(var_test_irq+8)(a4)
 ; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+12)(a7)
+; RV64I-NEXT:    lw a0, %lo(var_test_irq+12)(a4)
 ; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    addi a5, a7, %lo(var_test_irq)
+; RV64I-NEXT:    addi a5, a4, %lo(var_test_irq)
 ; RV64I-NEXT:    lw a0, 16(a5)
 ; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lw a0, 20(a5)
@@ -3967,22 +3967,22 @@ define void @callee_no_irq() {
 ; RV64I-NEXT:    lw s8, 84(a5)
 ; RV64I-NEXT:    lw s9, 88(a5)
 ; RV64I-NEXT:    lw s10, 92(a5)
-; RV64I-NEXT:    lw s11, 112(a5)
-; RV64I-NEXT:    lw ra, 116(a5)
-; RV64I-NEXT:    lw a3, 120(a5)
-; RV64I-NEXT:    lw a0, 124(a5)
-; RV64I-NEXT:    lw a6, 96(a5)
-; RV64I-NEXT:    lw a4, 100(a5)
-; RV64I-NEXT:    lw a2, 104(a5)
-; RV64I-NEXT:    lw a1, 108(a5)
-; RV64I-NEXT:    sw a0, 124(a5)
-; RV64I-NEXT:    sw a3, 120(a5)
-; RV64I-NEXT:    sw ra, 116(a5)
-; RV64I-NEXT:    sw s11, 112(a5)
-; RV64I-NEXT:    sw a1, 108(a5)
-; RV64I-NEXT:    sw a2, 104(a5)
-; RV64I-NEXT:    sw a4, 100(a5)
-; RV64I-NEXT:    sw a6, 96(a5)
+; RV64I-NEXT:    lw s11, 96(a5)
+; RV64I-NEXT:    lw ra, 100(a5)
+; RV64I-NEXT:    lw a6, 104(a5)
+; RV64I-NEXT:    lw a3, 108(a5)
+; RV64I-NEXT:    lw a2, 112(a5)
+; RV64I-NEXT:    lw a1, 116(a5)
+; RV64I-NEXT:    lw a0, 120(a5)
+; RV64I-NEXT:    lw a7, 124(a5)
+; RV64I-NEXT:    sw a7, 124(a5)
+; RV64I-NEXT:    sw a0, 120(a5)
+; RV64I-NEXT:    sw a1, 116(a5)
+; RV64I-NEXT:    sw a2, 112(a5)
+; RV64I-NEXT:    sw a3, 108(a5)
+; RV64I-NEXT:    sw a6, 104(a5)
+; RV64I-NEXT:    sw ra, 100(a5)
+; RV64I-NEXT:    sw s11, 96(a5)
 ; RV64I-NEXT:    sw s10, 92(a5)
 ; RV64I-NEXT:    sw s9, 88(a5)
 ; RV64I-NEXT:    sw s8, 84(a5)
@@ -4006,13 +4006,13 @@ define void @callee_no_irq() {
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sw a0, 16(a5)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+12)(a7)
+; RV64I-NEXT:    sw a0, %lo(var_test_irq+12)(a4)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+8)(a7)
+; RV64I-NEXT:    sw a0, %lo(var_test_irq+8)(a4)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+4)(a7)
+; RV64I-NEXT:    sw a0, %lo(var_test_irq+4)(a4)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq)(a7)
+; RV64I-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV64I-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 144(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 136(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
index 32261ee47164e..c53e6dc3b8089 100644
--- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
+++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
@@ -50,8 +50,8 @@ define void @test2(ptr nocapture noundef %a, i32 noundef signext %n) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    blez a1, .LBB1_7
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
-; CHECK-NEXT:    li a3, 1
 ; CHECK-NEXT:    andi a2, a1, 1
+; CHECK-NEXT:    li a3, 1
 ; CHECK-NEXT:    bne a1, a3, .LBB1_3
 ; CHECK-NEXT:  # %bb.2:
 ; CHECK-NEXT:    li a3, 0
diff --git a/llvm/test/CodeGen/RISCV/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/rotl-rotr.ll
index 634cca5dcdb71..5522e3c9a0fb9 100644
--- a/llvm/test/CodeGen/RISCV/rotl-rotr.ll
+++ b/llvm/test/CodeGen/RISCV/rotl-rotr.ll
@@ -119,8 +119,8 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind {
 define i64 @rotl_64(i64 %x, i64 %y) nounwind {
 ; RV32I-LABEL: rotl_64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a5, a2, -32
 ; RV32I-NEXT:    sll a4, a0, a2
+; RV32I-NEXT:    addi a5, a2, -32
 ; RV32I-NEXT:    bltz a5, .LBB2_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a3, a4
@@ -167,8 +167,8 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
 ;
 ; RV32ZBB-LABEL: rotl_64:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    addi a5, a2, -32
 ; RV32ZBB-NEXT:    sll a4, a0, a2
+; RV32ZBB-NEXT:    addi a5, a2, -32
 ; RV32ZBB-NEXT:    bltz a5, .LBB2_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    mv a3, a4
@@ -212,8 +212,8 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
 ;
 ; RV32XTHEADBB-LABEL: rotl_64:
 ; RV32XTHEADBB:       # %bb.0:
-; RV32XTHEADBB-NEXT:    addi a5, a2, -32
 ; RV32XTHEADBB-NEXT:    sll a4, a0, a2
+; RV32XTHEADBB-NEXT:    addi a5, a2, -32
 ; RV32XTHEADBB-NEXT:    bltz a5, .LBB2_2
 ; RV32XTHEADBB-NEXT:  # %bb.1:
 ; RV32XTHEADBB-NEXT:    mv a3, a4
@@ -267,8 +267,8 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
 define i64 @rotr_64(i64 %x, i64 %y) nounwind {
 ; RV32I-LABEL: rotr_64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a5, a2, -32
 ; RV32I-NEXT:    srl a4, a1, a2
+; RV32I-NEXT:    addi a5, a2, -32
 ; RV32I-NEXT:    bltz a5, .LBB3_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a3, a4
@@ -315,8 +315,8 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
 ;
 ; RV32ZBB-LABEL: rotr_64:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    addi a5, a2, -32
 ; RV32ZBB-NEXT:    srl a4, a1, a2
+; RV32ZBB-NEXT:    addi a5, a2, -32
 ; RV32ZBB-NEXT:    bltz a5, .LBB3_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    mv a3, a4
@@ -360,8 +360,8 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
 ;
 ; RV32XTHEADBB-LABEL: rotr_64:
 ; RV32XTHEADBB:       # %bb.0:
-; RV32XTHEADBB-NEXT:    addi a5, a2, -32
 ; RV32XTHEADBB-NEXT:    srl a4, a1, a2
+; RV32XTHEADBB-NEXT:    addi a5, a2, -32
 ; RV32XTHEADBB-NEXT:    bltz a5, .LBB3_2
 ; RV32XTHEADBB-NEXT:  # %bb.1:
 ; RV32XTHEADBB-NEXT:    mv a3, a4
@@ -707,8 +707,8 @@ define i32 @rotr_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind {
 define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
 ; RV32I-LABEL: rotl_64_mask:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a5, a2, -32
 ; RV32I-NEXT:    sll a4, a0, a2
+; RV32I-NEXT:    addi a5, a2, -32
 ; RV32I-NEXT:    bltz a5, .LBB10_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a3, a4
@@ -720,24 +720,24 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
 ; RV32I-NEXT:    srl a6, a7, a6
 ; RV32I-NEXT:    or a3, a3, a6
 ; RV32I-NEXT:  .LBB10_3:
-; RV32I-NEXT:    srai t0, a5, 31
+; RV32I-NEXT:    srai a6, a5, 31
 ; RV32I-NEXT:    neg a5, a2
-; RV32I-NEXT:    andi a7, a5, 63
-; RV32I-NEXT:    addi a6, a7, -32
-; RV32I-NEXT:    and a2, t0, a4
-; RV32I-NEXT:    bltz a6, .LBB10_5
+; RV32I-NEXT:    and a2, a6, a4
+; RV32I-NEXT:    andi a6, a5, 63
+; RV32I-NEXT:    addi a4, a6, -32
+; RV32I-NEXT:    bltz a4, .LBB10_5
 ; RV32I-NEXT:  # %bb.4:
-; RV32I-NEXT:    srl a0, a1, a7
+; RV32I-NEXT:    srl a0, a1, a6
 ; RV32I-NEXT:    j .LBB10_6
 ; RV32I-NEXT:  .LBB10_5:
 ; RV32I-NEXT:    srl a0, a0, a5
-; RV32I-NEXT:    not a4, a7
+; RV32I-NEXT:    not a6, a6
 ; RV32I-NEXT:    slli a7, a1, 1
-; RV32I-NEXT:    sll a4, a7, a4
-; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    sll a6, a7, a6
+; RV32I-NEXT:    or a0, a0, a6
 ; RV32I-NEXT:  .LBB10_6:
 ; RV32I-NEXT:    srl a1, a1, a5
-; RV32I-NEXT:    srai a4, a6, 31
+; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    and a1, a4, a1
 ; RV32I-NEXT:    or a1, a3, a1
 ; RV32I-NEXT:    or a0, a2, a0
@@ -753,8 +753,8 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
 ;
 ; RV32ZBB-LABEL: rotl_64_mask:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    addi a5, a2, -32
 ; RV32ZBB-NEXT:    sll a4, a0, a2
+; RV32ZBB-NEXT:    addi a5, a2, -32
 ; RV32ZBB-NEXT:    bltz a5, .LBB10_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    mv a3, a4
@@ -766,24 +766,24 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
 ; RV32ZBB-NEXT:    srl a6, a7, a6
 ; RV32ZBB-NEXT:    or a3, a3, a6
 ; RV32ZBB-NEXT:  .LBB10_3:
-; RV32ZBB-NEXT:    srai t0, a5, 31
+; RV32ZBB-NEXT:    srai a6, a5, 31
 ; RV32ZBB-NEXT:    neg a5, a2
-; RV32ZBB-NEXT:    andi a7, a5, 63
-; RV32ZBB-NEXT:    addi a6, a7, -32
-; RV32ZBB-NEXT:    and a2, t0, a4
-; RV32ZBB-NEXT:    bltz a6, .LBB10_5
+; RV32ZBB-NEXT:    and a2, a6, a4
+; RV32ZBB-NEXT:    andi a6, a5, 63
+; RV32ZBB-NEXT:    addi a4, a6, -32
+; RV32ZBB-NEXT:    bltz a4, .LBB10_5
 ; RV32ZBB-NEXT:  # %bb.4:
-; RV32ZBB-NEXT:    srl a0, a1, a7
+; RV32ZBB-NEXT:    srl a0, a1, a6
 ; RV32ZBB-NEXT:    j .LBB10_6
 ; RV32ZBB-NEXT:  .LBB10_5:
 ; RV32ZBB-NEXT:    srl a0, a0, a5
-; RV32ZBB-NEXT:    not a4, a7
+; RV32ZBB-NEXT:    not a6, a6
 ; RV32ZBB-NEXT:    slli a7, a1, 1
-; RV32ZBB-NEXT:    sll a4, a7, a4
-; RV32ZBB-NEXT:    or a0, a0, a4
+; RV32ZBB-NEXT:    sll a6, a7, a6
+; RV32ZBB-NEXT:    or a0, a0, a6
 ; RV32ZBB-NEXT:  .LBB10_6:
 ; RV32ZBB-NEXT:    srl a1, a1, a5
-; RV32ZBB-NEXT:    srai a4, a6, 31
+; RV32ZBB-NEXT:    srai a4, a4, 31
 ; RV32ZBB-NEXT:    and a1, a4, a1
 ; RV32ZBB-NEXT:    or a1, a3, a1
 ; RV32ZBB-NEXT:    or a0, a2, a0
@@ -796,8 +796,8 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
 ;
 ; RV32XTHEADBB-LABEL: rotl_64_mask:
 ; RV32XTHEADBB:       # %bb.0:
-; RV32XTHEADBB-NEXT:    addi a5, a2, -32
 ; RV32XTHEADBB-NEXT:    sll a4, a0, a2
+; RV32XTHEADBB-NEXT:    addi a5, a2, -32
 ; RV32XTHEADBB-NEXT:    bltz a5, .LBB10_2
 ; RV32XTHEADBB-NEXT:  # %bb.1:
 ; RV32XTHEADBB-NEXT:    mv a3, a4
@@ -809,24 +809,24 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind {
 ; RV32XTHEADBB-NEXT:    srl a6, a7, a6
 ; RV32XTHEADBB-NEXT:    or a3, a3, a6
 ; RV32XTHEADBB-NEXT:  .LBB10_3:
-; RV32XTHEADBB-NEXT:    srai t0, a5, 31
+; RV32XTHEADBB-NEXT:    srai a6, a5, 31
 ; RV32XTHEADBB-NEXT:    neg a5, a2
-; RV32XTHEADBB-NEXT:    andi a7, a5, 63
-; RV32XTHEADBB-NEXT:    addi a6, a7, -32
-; RV32XTHEADBB-NEXT:    and a2, t0, a4
-; RV32XTHEADBB-NEXT:    bltz a6, .LBB10_5
+; RV32XTHEADBB-NEXT:    and a2, a6, a4
+; RV32XTHEADBB-NEXT:    andi a6, a5, 63
+; RV32XTHEADBB-NEXT:    addi a4, a6, -32
+; RV32XTHEADBB-NEXT:    bltz a4, .LBB10_5
 ; RV32XTHEADBB-NEXT:  # %bb.4:
-; RV32XTHEADBB-NEXT:    srl a0, a1, a7
+; RV32XTHEADBB-NEXT:    srl a0, a1, a6
 ; RV32XTHEADBB-NEXT:    j .LBB10_6
 ; RV32XTHEADBB-NEXT:  .LBB10_5:
 ; RV32XTHEADBB-NEXT:    srl a0, a0, a5
-; RV32XTHEADBB-NEXT:    not a4, a7
+; RV32XTHEADBB-NEXT:    not a6, a6
 ; RV32XTHEADBB-NEXT:    slli a7, a1, 1
-; RV32XTHEADBB-NEXT:    sll a4, a7, a4
-; RV32XTHEADBB-NEXT:    or a0, a0, a4
+; RV32XTHEADBB-NEXT:    sll a6, a7, a6
+; RV32XTHEADBB-NEXT:    or a0, a0, a6
 ; RV32XTHEADBB-NEXT:  .LBB10_6:
 ; RV32XTHEADBB-NEXT:    srl a1, a1, a5
-; RV32XTHEADBB-NEXT:    srai a4, a6, 31
+; RV32XTHEADBB-NEXT:    srai a4, a4, 31
 ; RV32XTHEADBB-NEXT:    and a1, a4, a1
 ; RV32XTHEADBB-NEXT:    or a1, a3, a1
 ; RV32XTHEADBB-NEXT:    or a0, a2, a0
@@ -863,12 +863,12 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
 ; RV32I-NEXT:    srl a3, a6, a3
 ; RV32I-NEXT:    or a3, a5, a3
 ; RV32I-NEXT:  .LBB11_3:
-; RV32I-NEXT:    sll a7, a0, a2
-; RV32I-NEXT:    srai t0, a4, 31
+; RV32I-NEXT:    sll a5, a0, a2
+; RV32I-NEXT:    srai a6, a4, 31
 ; RV32I-NEXT:    neg a4, a2
+; RV32I-NEXT:    and a2, a6, a5
 ; RV32I-NEXT:    andi a6, a4, 63
 ; RV32I-NEXT:    addi a5, a6, -32
-; RV32I-NEXT:    and a2, t0, a7
 ; RV32I-NEXT:    bltz a5, .LBB11_5
 ; RV32I-NEXT:  # %bb.4:
 ; RV32I-NEXT:    srl a0, a1, a6
@@ -910,12 +910,12 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
 ; RV32ZBB-NEXT:    srl a3, a6, a3
 ; RV32ZBB-NEXT:    or a3, a5, a3
 ; RV32ZBB-NEXT:  .LBB11_3:
-; RV32ZBB-NEXT:    sll a7, a0, a2
-; RV32ZBB-NEXT:    srai t0, a4, 31
+; RV32ZBB-NEXT:    sll a5, a0, a2
+; RV32ZBB-NEXT:    srai a6, a4, 31
 ; RV32ZBB-NEXT:    neg a4, a2
+; RV32ZBB-NEXT:    and a2, a6, a5
 ; RV32ZBB-NEXT:    andi a6, a4, 63
 ; RV32ZBB-NEXT:    addi a5, a6, -32
-; RV32ZBB-NEXT:    and a2, t0, a7
 ; RV32ZBB-NEXT:    bltz a5, .LBB11_5
 ; RV32ZBB-NEXT:  # %bb.4:
 ; RV32ZBB-NEXT:    srl a0, a1, a6
@@ -954,12 +954,12 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
 ; RV32XTHEADBB-NEXT:    srl a3, a6, a3
 ; RV32XTHEADBB-NEXT:    or a3, a5, a3
 ; RV32XTHEADBB-NEXT:  .LBB11_3:
-; RV32XTHEADBB-NEXT:    sll a7, a0, a2
-; RV32XTHEADBB-NEXT:    srai t0, a4, 31
+; RV32XTHEADBB-NEXT:    sll a5, a0, a2
+; RV32XTHEADBB-NEXT:    srai a6, a4, 31
 ; RV32XTHEADBB-NEXT:    neg a4, a2
+; RV32XTHEADBB-NEXT:    and a2, a6, a5
 ; RV32XTHEADBB-NEXT:    andi a6, a4, 63
 ; RV32XTHEADBB-NEXT:    addi a5, a6, -32
-; RV32XTHEADBB-NEXT:    and a2, t0, a7
 ; RV32XTHEADBB-NEXT:    bltz a5, .LBB11_5
 ; RV32XTHEADBB-NEXT:  # %bb.4:
 ; RV32XTHEADBB-NEXT:    srl a0, a1, a6
@@ -1042,8 +1042,8 @@ define i64 @rotl_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind {
 define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
 ; RV32I-LABEL: rotr_64_mask:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a5, a2, -32
 ; RV32I-NEXT:    srl a4, a1, a2
+; RV32I-NEXT:    addi a5, a2, -32
 ; RV32I-NEXT:    bltz a5, .LBB13_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a3, a4
@@ -1055,24 +1055,24 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
 ; RV32I-NEXT:    sll a6, a7, a6
 ; RV32I-NEXT:    or a3, a3, a6
 ; RV32I-NEXT:  .LBB13_3:
-; RV32I-NEXT:    srai t0, a5, 31
+; RV32I-NEXT:    srai a6, a5, 31
 ; RV32I-NEXT:    neg a5, a2
-; RV32I-NEXT:    andi a7, a5, 63
-; RV32I-NEXT:    addi a6, a7, -32
-; RV32I-NEXT:    and a2, t0, a4
-; RV32I-NEXT:    bltz a6, .LBB13_5
+; RV32I-NEXT:    and a2, a6, a4
+; RV32I-NEXT:    andi a6, a5, 63
+; RV32I-NEXT:    addi a4, a6, -32
+; RV32I-NEXT:    bltz a4, .LBB13_5
 ; RV32I-NEXT:  # %bb.4:
-; RV32I-NEXT:    sll a1, a0, a7
+; RV32I-NEXT:    sll a1, a0, a6
 ; RV32I-NEXT:    j .LBB13_6
 ; RV32I-NEXT:  .LBB13_5:
 ; RV32I-NEXT:    sll a1, a1, a5
-; RV32I-NEXT:    not a4, a7
+; RV32I-NEXT:    not a6, a6
 ; RV32I-NEXT:    srli a7, a0, 1
-; RV32I-NEXT:    srl a4, a7, a4
-; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    srl a6, a7, a6
+; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:  .LBB13_6:
 ; RV32I-NEXT:    sll a0, a0, a5
-; RV32I-NEXT:    srai a4, a6, 31
+; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    and a0, a4, a0
 ; RV32I-NEXT:    or a0, a3, a0
 ; RV32I-NEXT:    or a1, a2, a1
@@ -1088,8 +1088,8 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
 ;
 ; RV32ZBB-LABEL: rotr_64_mask:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    addi a5, a2, -32
 ; RV32ZBB-NEXT:    srl a4, a1, a2
+; RV32ZBB-NEXT:    addi a5, a2, -32
 ; RV32ZBB-NEXT:    bltz a5, .LBB13_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    mv a3, a4
@@ -1101,24 +1101,24 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
 ; RV32ZBB-NEXT:    sll a6, a7, a6
 ; RV32ZBB-NEXT:    or a3, a3, a6
 ; RV32ZBB-NEXT:  .LBB13_3:
-; RV32ZBB-NEXT:    srai t0, a5, 31
+; RV32ZBB-NEXT:    srai a6, a5, 31
 ; RV32ZBB-NEXT:    neg a5, a2
-; RV32ZBB-NEXT:    andi a7, a5, 63
-; RV32ZBB-NEXT:    addi a6, a7, -32
-; RV32ZBB-NEXT:    and a2, t0, a4
-; RV32ZBB-NEXT:    bltz a6, .LBB13_5
+; RV32ZBB-NEXT:    and a2, a6, a4
+; RV32ZBB-NEXT:    andi a6, a5, 63
+; RV32ZBB-NEXT:    addi a4, a6, -32
+; RV32ZBB-NEXT:    bltz a4, .LBB13_5
 ; RV32ZBB-NEXT:  # %bb.4:
-; RV32ZBB-NEXT:    sll a1, a0, a7
+; RV32ZBB-NEXT:    sll a1, a0, a6
 ; RV32ZBB-NEXT:    j .LBB13_6
 ; RV32ZBB-NEXT:  .LBB13_5:
 ; RV32ZBB-NEXT:    sll a1, a1, a5
-; RV32ZBB-NEXT:    not a4, a7
+; RV32ZBB-NEXT:    not a6, a6
 ; RV32ZBB-NEXT:    srli a7, a0, 1
-; RV32ZBB-NEXT:    srl a4, a7, a4
-; RV32ZBB-NEXT:    or a1, a1, a4
+; RV32ZBB-NEXT:    srl a6, a7, a6
+; RV32ZBB-NEXT:    or a1, a1, a6
 ; RV32ZBB-NEXT:  .LBB13_6:
 ; RV32ZBB-NEXT:    sll a0, a0, a5
-; RV32ZBB-NEXT:    srai a4, a6, 31
+; RV32ZBB-NEXT:    srai a4, a4, 31
 ; RV32ZBB-NEXT:    and a0, a4, a0
 ; RV32ZBB-NEXT:    or a0, a3, a0
 ; RV32ZBB-NEXT:    or a1, a2, a1
@@ -1131,8 +1131,8 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
 ;
 ; RV32XTHEADBB-LABEL: rotr_64_mask:
 ; RV32XTHEADBB:       # %bb.0:
-; RV32XTHEADBB-NEXT:    addi a5, a2, -32
 ; RV32XTHEADBB-NEXT:    srl a4, a1, a2
+; RV32XTHEADBB-NEXT:    addi a5, a2, -32
 ; RV32XTHEADBB-NEXT:    bltz a5, .LBB13_2
 ; RV32XTHEADBB-NEXT:  # %bb.1:
 ; RV32XTHEADBB-NEXT:    mv a3, a4
@@ -1144,24 +1144,24 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind {
 ; RV32XTHEADBB-NEXT:    sll a6, a7, a6
 ; RV32XTHEADBB-NEXT:    or a3, a3, a6
 ; RV32XTHEADBB-NEXT:  .LBB13_3:
-; RV32XTHEADBB-NEXT:    srai t0, a5, 31
+; RV32XTHEADBB-NEXT:    srai a6, a5, 31
 ; RV32XTHEADBB-NEXT:    neg a5, a2
-; RV32XTHEADBB-NEXT:    andi a7, a5, 63
-; RV32XTHEADBB-NEXT:    addi a6, a7, -32
-; RV32XTHEADBB-NEXT:    and a2, t0, a4
-; RV32XTHEADBB-NEXT:    bltz a6, .LBB13_5
+; RV32XTHEADBB-NEXT:    and a2, a6, a4
+; RV32XTHEADBB-NEXT:    andi a6, a5, 63
+; RV32XTHEADBB-NEXT:    addi a4, a6, -32
+; RV32XTHEADBB-NEXT:    bltz a4, .LBB13_5
 ; RV32XTHEADBB-NEXT:  # %bb.4:
-; RV32XTHEADBB-NEXT:    sll a1, a0, a7
+; RV32XTHEADBB-NEXT:    sll a1, a0, a6
 ; RV32XTHEADBB-NEXT:    j .LBB13_6
 ; RV32XTHEADBB-NEXT:  .LBB13_5:
 ; RV32XTHEADBB-NEXT:    sll a1, a1, a5
-; RV32XTHEADBB-NEXT:    not a4, a7
+; RV32XTHEADBB-NEXT:    not a6, a6
 ; RV32XTHEADBB-NEXT:    srli a7, a0, 1
-; RV32XTHEADBB-NEXT:    srl a4, a7, a4
-; RV32XTHEADBB-NEXT:    or a1, a1, a4
+; RV32XTHEADBB-NEXT:    srl a6, a7, a6
+; RV32XTHEADBB-NEXT:    or a1, a1, a6
 ; RV32XTHEADBB-NEXT:  .LBB13_6:
 ; RV32XTHEADBB-NEXT:    sll a0, a0, a5
-; RV32XTHEADBB-NEXT:    srai a4, a6, 31
+; RV32XTHEADBB-NEXT:    srai a4, a4, 31
 ; RV32XTHEADBB-NEXT:    and a0, a4, a0
 ; RV32XTHEADBB-NEXT:    or a0, a3, a0
 ; RV32XTHEADBB-NEXT:    or a1, a2, a1
@@ -1198,12 +1198,12 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
 ; RV32I-NEXT:    sll a3, a6, a3
 ; RV32I-NEXT:    or a3, a5, a3
 ; RV32I-NEXT:  .LBB14_3:
-; RV32I-NEXT:    srl a7, a1, a2
-; RV32I-NEXT:    srai t0, a4, 31
+; RV32I-NEXT:    srl a5, a1, a2
+; RV32I-NEXT:    srai a6, a4, 31
 ; RV32I-NEXT:    neg a4, a2
+; RV32I-NEXT:    and a2, a6, a5
 ; RV32I-NEXT:    andi a6, a4, 63
 ; RV32I-NEXT:    addi a5, a6, -32
-; RV32I-NEXT:    and a2, t0, a7
 ; RV32I-NEXT:    bltz a5, .LBB14_5
 ; RV32I-NEXT:  # %bb.4:
 ; RV32I-NEXT:    sll a1, a0, a6
@@ -1245,12 +1245,12 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
 ; RV32ZBB-NEXT:    sll a3, a6, a3
 ; RV32ZBB-NEXT:    or a3, a5, a3
 ; RV32ZBB-NEXT:  .LBB14_3:
-; RV32ZBB-NEXT:    srl a7, a1, a2
-; RV32ZBB-NEXT:    srai t0, a4, 31
+; RV32ZBB-NEXT:    srl a5, a1, a2
+; RV32ZBB-NEXT:    srai a6, a4, 31
 ; RV32ZBB-NEXT:    neg a4, a2
+; RV32ZBB-NEXT:    and a2, a6, a5
 ; RV32ZBB-NEXT:    andi a6, a4, 63
 ; RV32ZBB-NEXT:    addi a5, a6, -32
-; RV32ZBB-NEXT:    and a2, t0, a7
 ; RV32ZBB-NEXT:    bltz a5, .LBB14_5
 ; RV32ZBB-NEXT:  # %bb.4:
 ; RV32ZBB-NEXT:    sll a1, a0, a6
@@ -1289,12 +1289,12 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind {
 ; RV32XTHEADBB-NEXT:    sll a3, a6, a3
 ; RV32XTHEADBB-NEXT:    or a3, a5, a3
 ; RV32XTHEADBB-NEXT:  .LBB14_3:
-; RV32XTHEADBB-NEXT:    srl a7, a1, a2
-; RV32XTHEADBB-NEXT:    srai t0, a4, 31
+; RV32XTHEADBB-NEXT:    srl a5, a1, a2
+; RV32XTHEADBB-NEXT:    srai a6, a4, 31
 ; RV32XTHEADBB-NEXT:    neg a4, a2
+; RV32XTHEADBB-NEXT:    and a2, a6, a5
 ; RV32XTHEADBB-NEXT:    andi a6, a4, 63
 ; RV32XTHEADBB-NEXT:    addi a5, a6, -32
-; RV32XTHEADBB-NEXT:    and a2, t0, a7
 ; RV32XTHEADBB-NEXT:    bltz a5, .LBB14_5
 ; RV32XTHEADBB-NEXT:  # %bb.4:
 ; RV32XTHEADBB-NEXT:    sll a1, a0, a6
@@ -1458,11 +1458,11 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV32I-NEXT:    not t0, a4
 ; RV32I-NEXT:    sll t1, a1, a4
 ; RV32I-NEXT:    srli a1, a6, 1
-; RV32I-NEXT:    srl a6, a0, t0
-; RV32I-NEXT:    srl t0, a1, t0
+; RV32I-NEXT:    srl a0, a0, t0
+; RV32I-NEXT:    srl a6, a1, t0
+; RV32I-NEXT:    or a1, a7, a0
+; RV32I-NEXT:    or a6, t1, a6
 ; RV32I-NEXT:    addi a0, a5, -32
-; RV32I-NEXT:    or a1, a7, a6
-; RV32I-NEXT:    or a6, t1, t0
 ; RV32I-NEXT:    bltz a0, .LBB17_6
 ; RV32I-NEXT:  # %bb.5:
 ; RV32I-NEXT:    sll a3, a2, a5
@@ -1512,11 +1512,11 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV32ZBB-NEXT:    not t0, a4
 ; RV32ZBB-NEXT:    sll t1, a1, a4
 ; RV32ZBB-NEXT:    srli a1, a6, 1
-; RV32ZBB-NEXT:    srl a6, a0, t0
-; RV32ZBB-NEXT:    srl t0, a1, t0
+; RV32ZBB-NEXT:    srl a0, a0, t0
+; RV32ZBB-NEXT:    srl a6, a1, t0
+; RV32ZBB-NEXT:    or a1, a7, a0
+; RV32ZBB-NEXT:    or a6, t1, a6
 ; RV32ZBB-NEXT:    addi a0, a5, -32
-; RV32ZBB-NEXT:    or a1, a7, a6
-; RV32ZBB-NEXT:    or a6, t1, t0
 ; RV32ZBB-NEXT:    bltz a0, .LBB17_6
 ; RV32ZBB-NEXT:  # %bb.5:
 ; RV32ZBB-NEXT:    sll a3, a2, a5
@@ -1562,11 +1562,11 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV32XTHEADBB-NEXT:    not t0, a4
 ; RV32XTHEADBB-NEXT:    sll t1, a1, a4
 ; RV32XTHEADBB-NEXT:    srli a1, a6, 1
-; RV32XTHEADBB-NEXT:    srl a6, a0, t0
-; RV32XTHEADBB-NEXT:    srl t0, a1, t0
+; RV32XTHEADBB-NEXT:    srl a0, a0, t0
+; RV32XTHEADBB-NEXT:    srl a6, a1, t0
+; RV32XTHEADBB-NEXT:    or a1, a7, a0
+; RV32XTHEADBB-NEXT:    or a6, t1, a6
 ; RV32XTHEADBB-NEXT:    addi a0, a5, -32
-; RV32XTHEADBB-NEXT:    or a1, a7, a6
-; RV32XTHEADBB-NEXT:    or a6, t1, t0
 ; RV32XTHEADBB-NEXT:    bltz a0, .LBB17_6
 ; RV32XTHEADBB-NEXT:  # %bb.5:
 ; RV32XTHEADBB-NEXT:    sll a3, a2, a5
@@ -1683,13 +1683,13 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV32I-NEXT:  .LBB19_4:
 ; RV32I-NEXT:    slli a1, a0, 1
 ; RV32I-NEXT:    not t0, a4
-; RV32I-NEXT:    srl t1, a0, a4
+; RV32I-NEXT:    srl a0, a0, a4
 ; RV32I-NEXT:    slli a6, a6, 1
 ; RV32I-NEXT:    sll a1, a1, t0
 ; RV32I-NEXT:    sll a6, a6, t0
-; RV32I-NEXT:    addi a0, a5, -32
 ; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    or a6, a6, t1
+; RV32I-NEXT:    or a6, a6, a0
+; RV32I-NEXT:    addi a0, a5, -32
 ; RV32I-NEXT:    bltz a0, .LBB19_6
 ; RV32I-NEXT:  # %bb.5:
 ; RV32I-NEXT:    sll a3, a2, a5
@@ -1736,13 +1736,13 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV32ZBB-NEXT:  .LBB19_4:
 ; RV32ZBB-NEXT:    slli a1, a0, 1
 ; RV32ZBB-NEXT:    not t0, a4
-; RV32ZBB-NEXT:    srl t1, a0, a4
+; RV32ZBB-NEXT:    srl a0, a0, a4
 ; RV32ZBB-NEXT:    slli a6, a6, 1
 ; RV32ZBB-NEXT:    sll a1, a1, t0
 ; RV32ZBB-NEXT:    sll a6, a6, t0
-; RV32ZBB-NEXT:    addi a0, a5, -32
 ; RV32ZBB-NEXT:    or a1, a1, a7
-; RV32ZBB-NEXT:    or a6, a6, t1
+; RV32ZBB-NEXT:    or a6, a6, a0
+; RV32ZBB-NEXT:    addi a0, a5, -32
 ; RV32ZBB-NEXT:    bltz a0, .LBB19_6
 ; RV32ZBB-NEXT:  # %bb.5:
 ; RV32ZBB-NEXT:    sll a3, a2, a5
@@ -1786,13 +1786,13 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV32XTHEADBB-NEXT:  .LBB19_4:
 ; RV32XTHEADBB-NEXT:    slli a1, a0, 1
 ; RV32XTHEADBB-NEXT:    not t0, a4
-; RV32XTHEADBB-NEXT:    srl t1, a0, a4
+; RV32XTHEADBB-NEXT:    srl a0, a0, a4
 ; RV32XTHEADBB-NEXT:    slli a6, a6, 1
 ; RV32XTHEADBB-NEXT:    sll a1, a1, t0
 ; RV32XTHEADBB-NEXT:    sll a6, a6, t0
-; RV32XTHEADBB-NEXT:    addi a0, a5, -32
 ; RV32XTHEADBB-NEXT:    or a1, a1, a7
-; RV32XTHEADBB-NEXT:    or a6, a6, t1
+; RV32XTHEADBB-NEXT:    or a6, a6, a0
+; RV32XTHEADBB-NEXT:    addi a0, a5, -32
 ; RV32XTHEADBB-NEXT:    bltz a0, .LBB19_6
 ; RV32XTHEADBB-NEXT:  # %bb.5:
 ; RV32XTHEADBB-NEXT:    sll a3, a2, a5
@@ -2314,8 +2314,8 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32I-LABEL: rotl_64_zext:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    neg a4, a2
-; RV32I-NEXT:    addi a6, a2, -32
 ; RV32I-NEXT:    sll a5, a0, a2
+; RV32I-NEXT:    addi a6, a2, -32
 ; RV32I-NEXT:    bltz a6, .LBB24_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a3, a5
@@ -2362,8 +2362,8 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32ZBB-LABEL: rotl_64_zext:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    neg a4, a2
-; RV32ZBB-NEXT:    addi a6, a2, -32
 ; RV32ZBB-NEXT:    sll a5, a0, a2
+; RV32ZBB-NEXT:    addi a6, a2, -32
 ; RV32ZBB-NEXT:    bltz a6, .LBB24_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    mv a3, a5
@@ -2407,8 +2407,8 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32XTHEADBB-LABEL: rotl_64_zext:
 ; RV32XTHEADBB:       # %bb.0:
 ; RV32XTHEADBB-NEXT:    neg a4, a2
-; RV32XTHEADBB-NEXT:    addi a6, a2, -32
 ; RV32XTHEADBB-NEXT:    sll a5, a0, a2
+; RV32XTHEADBB-NEXT:    addi a6, a2, -32
 ; RV32XTHEADBB-NEXT:    bltz a6, .LBB24_2
 ; RV32XTHEADBB-NEXT:  # %bb.1:
 ; RV32XTHEADBB-NEXT:    mv a3, a5
@@ -2464,8 +2464,8 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32I-LABEL: rotr_64_zext:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    neg a4, a2
-; RV32I-NEXT:    addi a6, a2, -32
 ; RV32I-NEXT:    srl a5, a1, a2
+; RV32I-NEXT:    addi a6, a2, -32
 ; RV32I-NEXT:    bltz a6, .LBB25_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a3, a5
@@ -2512,8 +2512,8 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32ZBB-LABEL: rotr_64_zext:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    neg a4, a2
-; RV32ZBB-NEXT:    addi a6, a2, -32
 ; RV32ZBB-NEXT:    srl a5, a1, a2
+; RV32ZBB-NEXT:    addi a6, a2, -32
 ; RV32ZBB-NEXT:    bltz a6, .LBB25_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    mv a3, a5
@@ -2557,8 +2557,8 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32XTHEADBB-LABEL: rotr_64_zext:
 ; RV32XTHEADBB:       # %bb.0:
 ; RV32XTHEADBB-NEXT:    neg a4, a2
-; RV32XTHEADBB-NEXT:    addi a6, a2, -32
 ; RV32XTHEADBB-NEXT:    srl a5, a1, a2
+; RV32XTHEADBB-NEXT:    addi a6, a2, -32
 ; RV32XTHEADBB-NEXT:    bltz a6, .LBB25_2
 ; RV32XTHEADBB-NEXT:  # %bb.1:
 ; RV32XTHEADBB-NEXT:    mv a3, a5
diff --git a/llvm/test/CodeGen/RISCV/rv32-inline-asm-pairs.ll b/llvm/test/CodeGen/RISCV/rv32-inline-asm-pairs.ll
index f14fe2665835e..3f1b2fab8bb10 100644
--- a/llvm/test/CodeGen/RISCV/rv32-inline-asm-pairs.ll
+++ b/llvm/test/CodeGen/RISCV/rv32-inline-asm-pairs.ll
@@ -42,8 +42,8 @@ define i64 @test_Pr_wide_scalar_inout(ptr %0, i64 noundef %1) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    sw a0, 12(sp)
 ; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    sw a0, 12(sp)
 ; CHECK-NEXT:    sw a1, 0(sp)
 ; CHECK-NEXT:    sw a3, 4(sp)
 ; CHECK-NEXT:    #APP
@@ -112,8 +112,8 @@ define i64 @test_cR_wide_scalar_inout(ptr %0, i64 noundef %1) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    sw a0, 12(sp)
 ; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    sw a0, 12(sp)
 ; CHECK-NEXT:    sw a1, 0(sp)
 ; CHECK-NEXT:    sw a3, 4(sp)
 ; CHECK-NEXT:    #APP
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 90a8eadb3f974..15cea807a26de 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -762,16 +762,16 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ;
 ; RV32ZBB-LABEL: ctpop_v2i64:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a2, 4(a1)
-; RV32ZBB-NEXT:    lw a3, 0(a1)
-; RV32ZBB-NEXT:    lw a4, 12(a1)
-; RV32ZBB-NEXT:    lw a1, 8(a1)
-; RV32ZBB-NEXT:    cpop a2, a2
+; RV32ZBB-NEXT:    lw a2, 0(a1)
+; RV32ZBB-NEXT:    lw a3, 4(a1)
+; RV32ZBB-NEXT:    lw a4, 8(a1)
+; RV32ZBB-NEXT:    lw a1, 12(a1)
 ; RV32ZBB-NEXT:    cpop a3, a3
-; RV32ZBB-NEXT:    cpop a4, a4
+; RV32ZBB-NEXT:    cpop a2, a2
 ; RV32ZBB-NEXT:    cpop a1, a1
-; RV32ZBB-NEXT:    add a2, a3, a2
-; RV32ZBB-NEXT:    add a1, a1, a4
+; RV32ZBB-NEXT:    cpop a4, a4
+; RV32ZBB-NEXT:    add a2, a2, a3
+; RV32ZBB-NEXT:    add a1, a4, a1
 ; RV32ZBB-NEXT:    sw a2, 0(a0)
 ; RV32ZBB-NEXT:    sw zero, 4(a0)
 ; RV32ZBB-NEXT:    sw a1, 8(a0)
@@ -806,18 +806,18 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind {
 ;
 ; RV32ZBB-LABEL: ctpop_v2i64_ult_two:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a1, 12(a0)
-; RV32ZBB-NEXT:    lw a2, 8(a0)
-; RV32ZBB-NEXT:    lw a3, 4(a0)
-; RV32ZBB-NEXT:    lw a0, 0(a0)
-; RV32ZBB-NEXT:    cpop a1, a1
-; RV32ZBB-NEXT:    cpop a2, a2
-; RV32ZBB-NEXT:    cpop a3, a3
+; RV32ZBB-NEXT:    lw a1, 0(a0)
+; RV32ZBB-NEXT:    lw a2, 4(a0)
+; RV32ZBB-NEXT:    lw a3, 8(a0)
+; RV32ZBB-NEXT:    lw a0, 12(a0)
 ; RV32ZBB-NEXT:    cpop a0, a0
-; RV32ZBB-NEXT:    add a1, a2, a1
-; RV32ZBB-NEXT:    add a0, a0, a3
-; RV32ZBB-NEXT:    sltiu a0, a0, 2
-; RV32ZBB-NEXT:    sltiu a1, a1, 2
+; RV32ZBB-NEXT:    cpop a3, a3
+; RV32ZBB-NEXT:    cpop a2, a2
+; RV32ZBB-NEXT:    cpop a1, a1
+; RV32ZBB-NEXT:    add a3, a3, a0
+; RV32ZBB-NEXT:    add a1, a1, a2
+; RV32ZBB-NEXT:    sltiu a0, a1, 2
+; RV32ZBB-NEXT:    sltiu a1, a3, 2
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
   %2 = icmp ult <2 x i64> %1, <i64 2, i64 2>
@@ -849,20 +849,20 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind {
 ;
 ; RV32ZBB-LABEL: ctpop_v2i64_ugt_one:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a1, 12(a0)
-; RV32ZBB-NEXT:    lw a2, 8(a0)
-; RV32ZBB-NEXT:    lw a3, 4(a0)
-; RV32ZBB-NEXT:    lw a0, 0(a0)
-; RV32ZBB-NEXT:    cpop a1, a1
-; RV32ZBB-NEXT:    cpop a2, a2
-; RV32ZBB-NEXT:    cpop a3, a3
+; RV32ZBB-NEXT:    lw a1, 0(a0)
+; RV32ZBB-NEXT:    lw a2, 4(a0)
+; RV32ZBB-NEXT:    lw a3, 8(a0)
+; RV32ZBB-NEXT:    lw a0, 12(a0)
 ; RV32ZBB-NEXT:    cpop a0, a0
-; RV32ZBB-NEXT:    add a1, a2, a1
-; RV32ZBB-NEXT:    add a0, a0, a3
-; RV32ZBB-NEXT:    sltiu a0, a0, 2
+; RV32ZBB-NEXT:    cpop a3, a3
+; RV32ZBB-NEXT:    cpop a2, a2
+; RV32ZBB-NEXT:    cpop a1, a1
+; RV32ZBB-NEXT:    add a0, a3, a0
+; RV32ZBB-NEXT:    add a1, a1, a2
 ; RV32ZBB-NEXT:    sltiu a1, a1, 2
-; RV32ZBB-NEXT:    xori a0, a0, 1
-; RV32ZBB-NEXT:    xori a1, a1, 1
+; RV32ZBB-NEXT:    sltiu a2, a0, 2
+; RV32ZBB-NEXT:    xori a0, a1, 1
+; RV32ZBB-NEXT:    xori a1, a2, 1
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
   %2 = icmp ugt <2 x i64> %1, <i64 1, i64 1>
@@ -904,20 +904,20 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind {
 ;
 ; RV32ZBB-LABEL: ctpop_v2i64_eq_one:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a1, 12(a0)
-; RV32ZBB-NEXT:    lw a2, 8(a0)
-; RV32ZBB-NEXT:    lw a3, 4(a0)
-; RV32ZBB-NEXT:    lw a0, 0(a0)
-; RV32ZBB-NEXT:    cpop a1, a1
-; RV32ZBB-NEXT:    cpop a2, a2
-; RV32ZBB-NEXT:    cpop a3, a3
+; RV32ZBB-NEXT:    lw a1, 0(a0)
+; RV32ZBB-NEXT:    lw a2, 4(a0)
+; RV32ZBB-NEXT:    lw a3, 8(a0)
+; RV32ZBB-NEXT:    lw a0, 12(a0)
 ; RV32ZBB-NEXT:    cpop a0, a0
-; RV32ZBB-NEXT:    add a1, a2, a1
-; RV32ZBB-NEXT:    add a0, a0, a3
-; RV32ZBB-NEXT:    addi a0, a0, -1
+; RV32ZBB-NEXT:    cpop a3, a3
+; RV32ZBB-NEXT:    cpop a2, a2
+; RV32ZBB-NEXT:    cpop a1, a1
+; RV32ZBB-NEXT:    add a0, a3, a0
+; RV32ZBB-NEXT:    add a1, a1, a2
 ; RV32ZBB-NEXT:    addi a1, a1, -1
-; RV32ZBB-NEXT:    seqz a0, a0
-; RV32ZBB-NEXT:    seqz a1, a1
+; RV32ZBB-NEXT:    addi a2, a0, -1
+; RV32ZBB-NEXT:    seqz a0, a1
+; RV32ZBB-NEXT:    seqz a1, a2
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
   %2 = icmp eq <2 x i64> %1, <i64 1, i64 1>
@@ -961,20 +961,20 @@ define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind {
 ;
 ; RV32ZBB-LABEL: ctpop_v2i64_ne_one:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a1, 12(a0)
-; RV32ZBB-NEXT:    lw a2, 8(a0)
-; RV32ZBB-NEXT:    lw a3, 4(a0)
-; RV32ZBB-NEXT:    lw a0, 0(a0)
-; RV32ZBB-NEXT:    cpop a1, a1
-; RV32ZBB-NEXT:    cpop a2, a2
-; RV32ZBB-NEXT:    cpop a3, a3
+; RV32ZBB-NEXT:    lw a1, 0(a0)
+; RV32ZBB-NEXT:    lw a2, 4(a0)
+; RV32ZBB-NEXT:    lw a3, 8(a0)
+; RV32ZBB-NEXT:    lw a0, 12(a0)
 ; RV32ZBB-NEXT:    cpop a0, a0
-; RV32ZBB-NEXT:    add a1, a2, a1
-; RV32ZBB-NEXT:    add a0, a0, a3
-; RV32ZBB-NEXT:    addi a0, a0, -1
+; RV32ZBB-NEXT:    cpop a3, a3
+; RV32ZBB-NEXT:    cpop a2, a2
+; RV32ZBB-NEXT:    cpop a1, a1
+; RV32ZBB-NEXT:    add a0, a3, a0
+; RV32ZBB-NEXT:    add a1, a1, a2
 ; RV32ZBB-NEXT:    addi a1, a1, -1
-; RV32ZBB-NEXT:    snez a0, a0
-; RV32ZBB-NEXT:    snez a1, a1
+; RV32ZBB-NEXT:    addi a2, a0, -1
+; RV32ZBB-NEXT:    snez a0, a1
+; RV32ZBB-NEXT:    snez a1, a2
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
   %2 = icmp ne <2 x i64> %1, <i64 1, i64 1>
diff --git a/llvm/test/CodeGen/RISCV/rv32zbs.ll b/llvm/test/CodeGen/RISCV/rv32zbs.ll
index 1a3beeb79b85b..17ea0a32cf475 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbs.ll
@@ -787,8 +787,8 @@ define i64 @bset_trailing_ones_i64_mask(i64 %a) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    andi a3, a0, 63
-; CHECK-NEXT:    addi a1, a3, -32
 ; CHECK-NEXT:    sll a0, a2, a0
+; CHECK-NEXT:    addi a1, a3, -32
 ; CHECK-NEXT:    bltz a1, .LBB43_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    sll a2, a2, a3
@@ -815,8 +815,8 @@ define i64 @bset_trailing_ones_i64_no_mask(i64 %a) nounwind {
 ; CHECK-LABEL: bset_trailing_ones_i64_no_mask:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, -1
-; CHECK-NEXT:    addi a2, a0, -32
 ; CHECK-NEXT:    sll a1, a1, a0
+; CHECK-NEXT:    addi a2, a0, -32
 ; CHECK-NEXT:    bltz a2, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rv64-double-convert.ll b/llvm/test/CodeGen/RISCV/rv64-double-convert.ll
index dd49d9e3e2dce..8865f244cee1e 100644
--- a/llvm/test/CodeGen/RISCV/rv64-double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-double-convert.ll
@@ -122,9 +122,9 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind {
 ; RV64ID-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    fsd fs0, 8(sp) # 8-byte Folded Spill
+; RV64ID-NEXT:    fmv.d fs0, fa0
 ; RV64ID-NEXT:    lui a0, %hi(.LCPI4_0)
 ; RV64ID-NEXT:    fld fa5, %lo(.LCPI4_0)(a0)
-; RV64ID-NEXT:    fmv.d fs0, fa0
 ; RV64ID-NEXT:    fle.d s0, fa5, fa0
 ; RV64ID-NEXT:    call __fixdfti
 ; RV64ID-NEXT:    li a2, -1
diff --git a/llvm/test/CodeGen/RISCV/rv64-half-convert.ll b/llvm/test/CodeGen/RISCV/rv64-half-convert.ll
index ea582ac258b71..a243d9ed68a33 100644
--- a/llvm/test/CodeGen/RISCV/rv64-half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-half-convert.ll
@@ -309,14 +309,14 @@ define i128 @fptoui_sat_f16_to_i128(half %a) nounwind {
 ; RV64IZFH-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64IZFH-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64IZFH-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64IZFH-NEXT:    fcvt.s.h fa0, fa0
 ; RV64IZFH-NEXT:    lui a0, %hi(.LCPI5_0)
+; RV64IZFH-NEXT:    fmv.w.x fa5, zero
+; RV64IZFH-NEXT:    fle.s a1, fa5, fa0
 ; RV64IZFH-NEXT:    flw fa5, %lo(.LCPI5_0)(a0)
-; RV64IZFH-NEXT:    fcvt.s.h fa0, fa0
-; RV64IZFH-NEXT:    fmv.w.x fa4, zero
-; RV64IZFH-NEXT:    fle.s a0, fa4, fa0
-; RV64IZFH-NEXT:    flt.s a1, fa5, fa0
-; RV64IZFH-NEXT:    neg s0, a1
-; RV64IZFH-NEXT:    neg s1, a0
+; RV64IZFH-NEXT:    flt.s a0, fa5, fa0
+; RV64IZFH-NEXT:    neg s0, a0
+; RV64IZFH-NEXT:    neg s1, a1
 ; RV64IZFH-NEXT:    call __fixunssfti
 ; RV64IZFH-NEXT:    and a0, s1, a0
 ; RV64IZFH-NEXT:    and a1, s1, a1
diff --git a/llvm/test/CodeGen/RISCV/rv64-inline-asm-pairs.ll b/llvm/test/CodeGen/RISCV/rv64-inline-asm-pairs.ll
index ac455b7fac882..c1b8d0865dca8 100644
--- a/llvm/test/CodeGen/RISCV/rv64-inline-asm-pairs.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-inline-asm-pairs.ll
@@ -42,8 +42,8 @@ define i128 @test_R_wide_scalar_inout(ptr %0, i128 noundef %1) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addi sp, sp, -32
 ; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    sd a0, 24(sp)
 ; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    sd a0, 24(sp)
 ; CHECK-NEXT:    sd a1, 0(sp)
 ; CHECK-NEXT:    sd a3, 8(sp)
 ; CHECK-NEXT:    #APP
@@ -112,8 +112,8 @@ define i128 @test_cR_wide_scalar_inout(ptr %0, i128 noundef %1) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addi sp, sp, -32
 ; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    sd a0, 24(sp)
 ; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    sd a0, 24(sp)
 ; CHECK-NEXT:    sd a1, 0(sp)
 ; CHECK-NEXT:    sd a3, 8(sp)
 ; CHECK-NEXT:    #APP
diff --git a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
index 1ec4d8ddd1d84..8379036b2d74d 100644
--- a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
@@ -29,8 +29,8 @@ define i64 @test0(i64 %n, ptr %p) nounwind {
 ; RV64-NEXT:    sd a3, 16(sp)
 ; RV64-NEXT:    sd a1, 24(sp)
 ; RV64-NEXT:    addi a1, sp, 24
-; RV64-NEXT:    addi a0, sp, 8
 ; RV64-NEXT:    addi s1, sp, 8
+; RV64-NEXT:    addi a0, sp, 8
 ; RV64-NEXT:    call __clear_cache
 ; RV64-NEXT:    mv a0, s0
 ; RV64-NEXT:    jalr s1
@@ -60,8 +60,8 @@ define i64 @test0(i64 %n, ptr %p) nounwind {
 ; RV64-LINUX-NEXT:    sd a3, 16(sp)
 ; RV64-LINUX-NEXT:    sd a1, 24(sp)
 ; RV64-LINUX-NEXT:    addi a1, sp, 24
-; RV64-LINUX-NEXT:    addi a0, sp, 8
 ; RV64-LINUX-NEXT:    addi s1, sp, 8
+; RV64-LINUX-NEXT:    addi a0, sp, 8
 ; RV64-LINUX-NEXT:    li a2, 0
 ; RV64-LINUX-NEXT:    call __riscv_flush_icache
 ; RV64-LINUX-NEXT:    mv a0, s0
diff --git a/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll b/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll
index b8c43289bdfed..dd16e2beacec2 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll
@@ -169,9 +169,9 @@ define signext i32 @andi_srliw(i32 signext %0, ptr %1, i32 signext %2) {
 ; CHECK-LABEL: andi_srliw:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a3, a0, -8
-; CHECK-NEXT:    srliw a4, a0, 3
+; CHECK-NEXT:    srliw a0, a0, 3
+; CHECK-NEXT:    sw a0, 0(a1)
 ; CHECK-NEXT:    addw a0, a3, a2
-; CHECK-NEXT:    sw a4, 0(a1)
 ; CHECK-NEXT:    ret
   %4 = and i32 %0, -8
   %5 = lshr i32 %0, 3
diff --git a/llvm/test/CodeGen/RISCV/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
index 985837d05caa2..b87d3504ce9ff 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
@@ -106,8 +106,8 @@ define i64 @pack_i64_3(ptr %0, ptr %1) {
 ; RV64I-LABEL: pack_i64_3:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lw a0, 0(a0)
-; RV64I-NEXT:    lwu a1, 0(a1)
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    lwu a1, 0(a1)
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
index 4ade6c09fe43d..fa6ae2f8b171e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
@@ -103,13 +103,13 @@ define <8 x i1> @fv8(ptr %p, i64 %index, i64 %tc) {
 define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-LABEL: fv32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI8_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI8_0)
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v0, v8, a2
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v0, v16, a2
 ; CHECK-NEXT:    vsext.vf8 v16, v8
 ; CHECK-NEXT:    vsaddu.vx v8, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
@@ -130,11 +130,8 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT:    vle8.v v16, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_1)
-; CHECK-NEXT:    vle8.v v17, (a0)
-; CHECK-NEXT:    lui a0, %hi(.LCPI9_2)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_2)
 ; CHECK-NEXT:    vsaddu.vx v8, v8, a1
-; CHECK-NEXT:    vle8.v v18, (a0)
+; CHECK-NEXT:    vle8.v v17, (a0)
 ; CHECK-NEXT:    vmsltu.vx v0, v8, a2
 ; CHECK-NEXT:    vsext.vf8 v8, v16
 ; CHECK-NEXT:    vsaddu.vx v8, v8, a1
@@ -142,13 +139,16 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT:    vsext.vf8 v8, v17
 ; CHECK-NEXT:    vsaddu.vx v8, v8, a1
 ; CHECK-NEXT:    vmsltu.vx v17, v8, a2
+; CHECK-NEXT:    lui a0, %hi(.LCPI9_2)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_2)
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 2
 ; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v0, v17, 4
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf8 v8, v18
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 6
diff --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll
index 9ac2775d30668..3f4a7fca33293 100644
--- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll
@@ -17,17 +17,17 @@ define void @test(ptr %addr) {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb
 ; CHECK-NEXT:    csrrs a1, vlenb, zero
 ; CHECK-NEXT:    vl1re64.v v8, (a0)
-; CHECK-NEXT:    slli a2, a1, 1
-; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    add a3, a0, a1
 ; CHECK-NEXT:    vl1re64.v v9, (a3)
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    slli a3, a1, 1
+; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:    vl1re64.v v10, (a0)
-; CHECK-NEXT:    add a2, a3, a2
-; CHECK-NEXT:    vs1r.v v8, (a3)
-; CHECK-NEXT:    vs1r.v v9, (a2)
-; CHECK-NEXT:    vs1r.v v10, (a1)
+; CHECK-NEXT:    add a3, a2, a3
+; CHECK-NEXT:    vs1r.v v8, (a2)
+; CHECK-NEXT:    vs1r.v v10, (a3)
+; CHECK-NEXT:    vs1r.v v9, (a1)
 ; CHECK-NEXT:    csrrs a0, vlenb, zero
 ; CHECK-NEXT:    slli a1, a0, 1
 ; CHECK-NEXT:    add a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll
index fb25d4e15e40e..5fecb75d847a0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll
@@ -17,14 +17,14 @@ define <vscale x 1 x double> @test(ptr %addr, i64 %vl) {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    csrrs a2, vlenb, zero
 ; CHECK-NEXT:    vl1re64.v v8, (a0)
+; CHECK-NEXT:    addi a3, sp, 16
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    vl1re64.v v9, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    add a2, a0, a2
-; CHECK-NEXT:    vs1r.v v8, (a0)
+; CHECK-NEXT:    add a2, a3, a2
+; CHECK-NEXT:    vs1r.v v8, (a3)
 ; CHECK-NEXT:    vs1r.v v9, (a2)
 ; CHECK-NEXT:    vl1re64.v v8, (a2)
-; CHECK-NEXT:    vl1re64.v v9, (a0)
+; CHECK-NEXT:    vl1re64.v v9, (a3)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-NEXT:    vfadd.vv v8, v9, v8
 ; CHECK-NEXT:    csrrs a0, vlenb, zero
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
index 1ed84316d4484..d7c608fffd7a3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
@@ -713,59 +713,59 @@ define <vscale x 1 x i64> @bitreverse_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 24
 ; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsrl.vi v10, v8, 8
 ; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsrl.vx v10, v8, a1
-; RV32-NEXT:    vsrl.vx v11, v8, a2
+; RV32-NEXT:    vsrl.vx v11, v8, a1
+; RV32-NEXT:    vsrl.vx v12, v8, a2
 ; RV32-NEXT:    addi a0, a3, -256
-; RV32-NEXT:    vsll.vx v12, v8, a1
-; RV32-NEXT:    vand.vx v11, v11, a0
-; RV32-NEXT:    vlse64.v v13, (a5), zero
-; RV32-NEXT:    vor.vv v10, v11, v10
-; RV32-NEXT:    vand.vx v11, v8, a0
-; RV32-NEXT:    vsll.vx v11, v11, a2
-; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vsrl.vi v12, v8, 8
 ; RV32-NEXT:    vand.vx v9, v9, a4
-; RV32-NEXT:    vand.vv v12, v12, v13
-; RV32-NEXT:    vor.vv v9, v12, v9
+; RV32-NEXT:    vsll.vx v13, v8, a1
+; RV32-NEXT:    vand.vx v12, v12, a0
+; RV32-NEXT:    vor.vv v11, v12, v11
+; RV32-NEXT:    vand.vx v12, v8, a0
+; RV32-NEXT:    vsll.vx v12, v12, a2
+; RV32-NEXT:    vor.vv v12, v13, v12
+; RV32-NEXT:    vlse64.v v13, (a5), zero
+; RV32-NEXT:    vand.vv v10, v10, v13
+; RV32-NEXT:    vor.vv v9, v10, v9
 ; RV32-NEXT:    lui a0, 61681
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    vand.vv v12, v8, v13
+; RV32-NEXT:    vand.vv v10, v8, v13
 ; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    addi a0, a0, -241
 ; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    addi a2, a2, 1365
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vor.vv v9, v9, v10
+; RV32-NEXT:    vor.vv v9, v9, v11
 ; RV32-NEXT:    vsetvli a3, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vmv.v.x v11, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vi v12, v12, 8
-; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vsll.vi v10, v10, 8
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vor.vv v8, v11, v8
+; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v11, a2
+; RV32-NEXT:    vmv.v.x v12, a2
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v9, v9, v11
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vand.vv v9, v9, v12
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vand.vv v9, v9, v11
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v9, v9, v12
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -852,42 +852,42 @@ define <vscale x 2 x i64> @bitreverse_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    vsetvli a4, zero, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vsrl.vi v10, v8, 24
 ; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsrl.vi v14, v8, 8
 ; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsrl.vx v10, v8, a1
-; RV32-NEXT:    vsrl.vx v12, v8, a2
+; RV32-NEXT:    vsrl.vx v12, v8, a1
+; RV32-NEXT:    vsrl.vx v16, v8, a2
 ; RV32-NEXT:    addi a0, a3, -256
-; RV32-NEXT:    vsll.vx v18, v8, a1
-; RV32-NEXT:    vand.vx v12, v12, a0
-; RV32-NEXT:    vlse64.v v14, (a5), zero
-; RV32-NEXT:    vor.vv v12, v12, v10
-; RV32-NEXT:    vand.vx v10, v8, a0
-; RV32-NEXT:    vsll.vx v10, v10, a2
-; RV32-NEXT:    vor.vv v10, v18, v10
-; RV32-NEXT:    vsrl.vi v18, v8, 8
-; RV32-NEXT:    vand.vx v16, v16, a4
-; RV32-NEXT:    vand.vv v18, v18, v14
-; RV32-NEXT:    vor.vv v16, v18, v16
+; RV32-NEXT:    vand.vx v18, v10, a4
+; RV32-NEXT:    vsll.vx v10, v8, a1
+; RV32-NEXT:    vand.vx v16, v16, a0
+; RV32-NEXT:    vor.vv v12, v16, v12
+; RV32-NEXT:    vand.vx v16, v8, a0
+; RV32-NEXT:    vsll.vx v16, v16, a2
+; RV32-NEXT:    vor.vv v10, v10, v16
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    vand.vv v14, v14, v16
+; RV32-NEXT:    vor.vv v14, v14, v18
 ; RV32-NEXT:    lui a0, 61681
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    vand.vv v14, v8, v14
+; RV32-NEXT:    vand.vv v16, v8, v16
 ; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    addi a0, a0, -241
 ; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    addi a2, a2, 1365
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vor.vv v12, v16, v12
+; RV32-NEXT:    vor.vv v12, v14, v12
 ; RV32-NEXT:    vsetvli a3, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    vmv.v.x v14, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vi v14, v14, 8
-; RV32-NEXT:    vor.vv v8, v8, v14
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v14, a1
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
@@ -895,13 +895,13 @@ define <vscale x 2 x i64> @bitreverse_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v12, v12, v16
+; RV32-NEXT:    vand.vv v8, v8, v14
+; RV32-NEXT:    vand.vv v12, v12, v14
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v14
-; RV32-NEXT:    vand.vv v12, v12, v14
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
@@ -993,42 +993,42 @@ define <vscale x 4 x i64> @bitreverse_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 24
+; RV32-NEXT:    vsrl.vi v12, v8, 24
 ; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsrl.vi v20, v8, 8
 ; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsrl.vx v12, v8, a1
-; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    vsrl.vx v16, v8, a1
+; RV32-NEXT:    vsrl.vx v24, v8, a2
 ; RV32-NEXT:    addi a0, a3, -256
-; RV32-NEXT:    vsll.vx v28, v8, a1
-; RV32-NEXT:    vand.vx v16, v16, a0
-; RV32-NEXT:    vlse64.v v20, (a5), zero
-; RV32-NEXT:    vor.vv v16, v16, v12
-; RV32-NEXT:    vand.vx v12, v8, a0
-; RV32-NEXT:    vsll.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v12, v28, v12
-; RV32-NEXT:    vsrl.vi v28, v8, 8
-; RV32-NEXT:    vand.vx v24, v24, a4
-; RV32-NEXT:    vand.vv v28, v28, v20
-; RV32-NEXT:    vor.vv v24, v28, v24
+; RV32-NEXT:    vand.vx v28, v12, a4
+; RV32-NEXT:    vsll.vx v12, v8, a1
+; RV32-NEXT:    vand.vx v24, v24, a0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vand.vx v24, v8, a0
+; RV32-NEXT:    vsll.vx v24, v24, a2
+; RV32-NEXT:    vor.vv v12, v12, v24
+; RV32-NEXT:    vlse64.v v24, (a5), zero
+; RV32-NEXT:    vand.vv v20, v20, v24
+; RV32-NEXT:    vor.vv v20, v20, v28
 ; RV32-NEXT:    lui a0, 61681
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    vand.vv v20, v8, v20
+; RV32-NEXT:    vand.vv v24, v8, v24
 ; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    addi a0, a0, -241
 ; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    addi a2, a2, 1365
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vor.vv v16, v20, v16
 ; RV32-NEXT:    vsetvli a3, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    vmv.v.x v20, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vi v20, v20, 8
-; RV32-NEXT:    vor.vv v8, v8, v20
+; RV32-NEXT:    vsll.vi v24, v24, 8
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v20, a1
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
@@ -1036,13 +1036,13 @@ define <vscale x 4 x i64> @bitreverse_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vand.vv v8, v8, v20
+; RV32-NEXT:    vand.vv v16, v16, v20
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v20
-; RV32-NEXT:    vand.vv v16, v16, v20
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
@@ -1137,38 +1137,38 @@ define <vscale x 8 x i64> @bitreverse_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    addi a5, sp, 8
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vsll.vx v0, v8, a1
-; RV32-NEXT:    vand.vx v24, v24, a0
+; RV32-NEXT:    vand.vx v24, v24, a3
 ; RV32-NEXT:    vor.vv v16, v24, v16
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v16, v8, a0
+; RV32-NEXT:    vand.vx v16, v8, a3
 ; RV32-NEXT:    vsll.vx v16, v16, a2
 ; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v0, (a5), zero
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a4
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 24
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a2, sp, 8
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    vand.vx v0, v0, a1
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vlse64.v v16, (a2), zero
+; RV32-NEXT:    vand.vv v24, v24, v16
+; RV32-NEXT:    vor.vv v24, v24, v0
 ; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v16, v24
-; RV32-NEXT:    vand.vv v16, v8, v0
-; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vand.vx v8, v8, a1
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v16, v16, 8
 ; RV32-NEXT:    vor.vv v8, v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 4d34621cd5f24..e2c8bc8b29171 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -1585,58 +1585,58 @@ define <vscale x 1 x i64> @vp_bitreverse_nxv1i64_unmasked(<vscale x 1 x i64> %va
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 24
+; RV32-NEXT:    vsrl.vi v10, v8, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsll.vx v10, v8, a2
+; RV32-NEXT:    vsll.vx v11, v8, a2
 ; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vsrl.vx v11, v8, a2
-; RV32-NEXT:    vsrl.vx v12, v8, a4
+; RV32-NEXT:    vsrl.vx v12, v8, a2
+; RV32-NEXT:    vsrl.vx v13, v8, a4
+; RV32-NEXT:    vand.vx v9, v9, a5
+; RV32-NEXT:    vand.vx v13, v13, a1
+; RV32-NEXT:    vor.vv v12, v13, v12
 ; RV32-NEXT:    vand.vx v13, v8, a1
-; RV32-NEXT:    vand.vx v12, v12, a1
-; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vlse64.v v12, (a6), zero
 ; RV32-NEXT:    vsll.vx v13, v13, a4
-; RV32-NEXT:    vor.vv v10, v10, v13
-; RV32-NEXT:    vsrl.vi v13, v8, 8
-; RV32-NEXT:    vand.vx v9, v9, a5
-; RV32-NEXT:    vand.vv v13, v13, v12
-; RV32-NEXT:    vor.vv v9, v13, v9
+; RV32-NEXT:    vor.vv v11, v11, v13
+; RV32-NEXT:    vlse64.v v13, (a6), zero
+; RV32-NEXT:    vand.vv v10, v10, v13
+; RV32-NEXT:    vor.vv v9, v10, v9
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    vand.vv v12, v8, v12
+; RV32-NEXT:    vand.vv v10, v8, v13
 ; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    addi a2, a2, 819
 ; RV32-NEXT:    addi a3, a3, 1365
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v12, v12, 8
-; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vsll.vi v10, v10, 8
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vor.vv v9, v9, v11
+; RV32-NEXT:    vor.vv v9, v9, v12
 ; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v11, a2
+; RV32-NEXT:    vmv.v.x v12, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vor.vv v8, v11, v8
 ; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a3
+; RV32-NEXT:    vmv.v.x v11, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vand.vv v9, v9, v12
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vand.vv v9, v9, v11
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v9, v9, v12
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v9, v9, v11
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1866,23 +1866,23 @@ define <vscale x 2 x i64> @vp_bitreverse_nxv2i64_unmasked(<vscale x 2 x i64> %va
 ; RV32-NEXT:    lui a5, 4080
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v14, v8, 24
+; RV32-NEXT:    vsrl.vi v10, v8, 24
+; RV32-NEXT:    vsrl.vi v14, v8, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsll.vx v12, v8, a2
 ; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vsrl.vx v10, v8, a2
-; RV32-NEXT:    vsrl.vx v16, v8, a4
-; RV32-NEXT:    vand.vx v18, v8, a1
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    vor.vv v10, v16, v10
+; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    vsrl.vx v18, v8, a4
+; RV32-NEXT:    vand.vx v20, v10, a5
+; RV32-NEXT:    vand.vx v10, v18, a1
+; RV32-NEXT:    vor.vv v10, v10, v16
+; RV32-NEXT:    vand.vx v16, v8, a1
+; RV32-NEXT:    vsll.vx v16, v16, a4
+; RV32-NEXT:    vor.vv v12, v12, v16
 ; RV32-NEXT:    vlse64.v v16, (a6), zero
-; RV32-NEXT:    vsll.vx v18, v18, a4
-; RV32-NEXT:    vor.vv v12, v12, v18
-; RV32-NEXT:    vsrl.vi v18, v8, 8
-; RV32-NEXT:    vand.vx v14, v14, a5
-; RV32-NEXT:    vand.vv v18, v18, v16
-; RV32-NEXT:    vor.vv v14, v18, v14
+; RV32-NEXT:    vand.vv v14, v14, v16
+; RV32-NEXT:    vor.vv v14, v14, v20
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    lui a3, 349525
@@ -2148,23 +2148,23 @@ define <vscale x 4 x i64> @vp_bitreverse_nxv4i64_unmasked(<vscale x 4 x i64> %va
 ; RV32-NEXT:    lui a5, 4080
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v20, v8, 24
+; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    vsrl.vi v20, v8, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsll.vx v16, v8, a2
 ; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vsrl.vx v12, v8, a2
-; RV32-NEXT:    vsrl.vx v24, v8, a4
-; RV32-NEXT:    vand.vx v28, v8, a1
-; RV32-NEXT:    vand.vx v24, v24, a1
-; RV32-NEXT:    vor.vv v12, v24, v12
+; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    vsrl.vx v28, v8, a4
+; RV32-NEXT:    vand.vx v4, v12, a5
+; RV32-NEXT:    vand.vx v12, v28, a1
+; RV32-NEXT:    vor.vv v12, v12, v24
+; RV32-NEXT:    vand.vx v24, v8, a1
+; RV32-NEXT:    vsll.vx v24, v24, a4
+; RV32-NEXT:    vor.vv v16, v16, v24
 ; RV32-NEXT:    vlse64.v v24, (a6), zero
-; RV32-NEXT:    vsll.vx v28, v28, a4
-; RV32-NEXT:    vor.vv v16, v16, v28
-; RV32-NEXT:    vsrl.vi v28, v8, 8
-; RV32-NEXT:    vand.vx v20, v20, a5
-; RV32-NEXT:    vand.vv v28, v28, v24
-; RV32-NEXT:    vor.vv v20, v28, v20
+; RV32-NEXT:    vand.vv v20, v20, v24
+; RV32-NEXT:    vor.vv v20, v20, v4
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    lui a3, 349525
@@ -2288,66 +2288,68 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vmv8r.v v24, v8
 ; RV32-NEXT:    lui a1, 1044480
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    addi a5, sp, 8
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
-; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vsll.vx v8, v8, a4, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v16, v24, a5, v0.t
+; RV32-NEXT:    vsll.vi v8, v16, 24, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v8, (a6), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a5, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 4
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 4
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v8, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v24, 24, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a5, v0.t
+; RV32-NEXT:    vsrl.vi v8, v24, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -2497,40 +2499,40 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64_unmasked(<vscale x 7 x i64> %va
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    lui a5, 4080
-; RV32-NEXT:    addi a6, sp, 8
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
 ; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    addi a2, sp, 8
+; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vsrl.vx v0, v8, a4
-; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vand.vx v0, v0, a3
 ; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v24, v8, a1
+; RV32-NEXT:    addi a6, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a6) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v24, v8, a3
 ; RV32-NEXT:    vsll.vx v24, v24, a4
 ; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v24, (a6), zero
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a5
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v8, 24
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    vand.vx v0, v24, a5
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vlse64.v v16, (a2), zero
+; RV32-NEXT:    vand.vv v24, v24, v16
+; RV32-NEXT:    vor.vv v0, v24, v0
+; RV32-NEXT:    vand.vv v16, v8, v16
 ; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v24, v8, v24
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v24, v8, v16
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v0, v8
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    lui a3, 349525
@@ -2673,66 +2675,68 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vmv8r.v v24, v8
 ; RV32-NEXT:    lui a1, 1044480
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    addi a5, sp, 8
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
-; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vsll.vx v8, v8, a4, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v16, v24, a5, v0.t
+; RV32-NEXT:    vsll.vi v8, v16, 24, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v8, (a6), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a5, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 4
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 4
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v8, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v24, 24, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a5, v0.t
+; RV32-NEXT:    vsrl.vi v8, v24, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -2882,40 +2886,40 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64_unmasked(<vscale x 8 x i64> %va
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    lui a5, 4080
-; RV32-NEXT:    addi a6, sp, 8
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
 ; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    addi a2, sp, 8
+; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vsrl.vx v0, v8, a4
-; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vand.vx v0, v0, a3
 ; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v24, v8, a1
+; RV32-NEXT:    addi a6, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a6) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v24, v8, a3
 ; RV32-NEXT:    vsll.vx v24, v24, a4
 ; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v24, (a6), zero
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a5
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v8, 24
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    vand.vx v0, v24, a5
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vlse64.v v16, (a2), zero
+; RV32-NEXT:    vand.vv v24, v24, v16
+; RV32-NEXT:    vor.vv v0, v24, v0
+; RV32-NEXT:    vand.vv v16, v8, v16
 ; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v24, v8, v24
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v24, v8, v16
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v0, v8
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    lui a3, 349525
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
index 2cd763afa36b7..ee8bfe8910b78 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
@@ -265,30 +265,30 @@ define <vscale x 1 x i64> @bswap_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 24
 ; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsrl.vi v10, v8, 8
 ; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsrl.vx v10, v8, a1
-; RV32-NEXT:    vsrl.vx v11, v8, a2
+; RV32-NEXT:    vsrl.vx v11, v8, a1
+; RV32-NEXT:    vsrl.vx v12, v8, a2
 ; RV32-NEXT:    addi a0, a3, -256
-; RV32-NEXT:    vsll.vx v12, v8, a1
-; RV32-NEXT:    vand.vx v11, v11, a0
-; RV32-NEXT:    vlse64.v v13, (a5), zero
-; RV32-NEXT:    vor.vv v10, v11, v10
-; RV32-NEXT:    vand.vx v11, v8, a0
-; RV32-NEXT:    vsll.vx v11, v11, a2
-; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vsrl.vi v12, v8, 8
 ; RV32-NEXT:    vand.vx v9, v9, a4
-; RV32-NEXT:    vand.vv v12, v12, v13
-; RV32-NEXT:    vor.vv v9, v12, v9
-; RV32-NEXT:    vand.vv v12, v8, v13
+; RV32-NEXT:    vsll.vx v13, v8, a1
+; RV32-NEXT:    vand.vx v12, v12, a0
+; RV32-NEXT:    vor.vv v11, v12, v11
+; RV32-NEXT:    vand.vx v12, v8, a0
+; RV32-NEXT:    vsll.vx v12, v12, a2
+; RV32-NEXT:    vor.vv v12, v13, v12
+; RV32-NEXT:    vlse64.v v13, (a5), zero
+; RV32-NEXT:    vand.vv v10, v10, v13
+; RV32-NEXT:    vor.vv v9, v10, v9
+; RV32-NEXT:    vand.vv v10, v8, v13
 ; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v12, v12, 8
-; RV32-NEXT:    vor.vv v9, v9, v10
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v11, v8
+; RV32-NEXT:    vsll.vi v10, v10, 8
+; RV32-NEXT:    vor.vv v9, v9, v11
+; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -349,30 +349,30 @@ define <vscale x 2 x i64> @bswap_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32-NEXT:    vsetvli a4, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vi v10, v8, 24
 ; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsrl.vi v12, v8, 8
 ; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsrl.vx v12, v8, a1
-; RV32-NEXT:    vsrl.vx v14, v8, a2
+; RV32-NEXT:    vsrl.vx v14, v8, a1
+; RV32-NEXT:    vsrl.vx v16, v8, a2
 ; RV32-NEXT:    addi a0, a3, -256
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    vand.vx v14, v14, a0
-; RV32-NEXT:    vlse64.v v18, (a5), zero
-; RV32-NEXT:    vor.vv v12, v14, v12
-; RV32-NEXT:    vand.vx v14, v8, a0
-; RV32-NEXT:    vsll.vx v14, v14, a2
-; RV32-NEXT:    vor.vv v14, v16, v14
-; RV32-NEXT:    vsrl.vi v16, v8, 8
 ; RV32-NEXT:    vand.vx v10, v10, a4
-; RV32-NEXT:    vand.vv v16, v16, v18
-; RV32-NEXT:    vor.vv v10, v16, v10
-; RV32-NEXT:    vand.vv v16, v8, v18
+; RV32-NEXT:    vsll.vx v18, v8, a1
+; RV32-NEXT:    vand.vx v16, v16, a0
+; RV32-NEXT:    vor.vv v14, v16, v14
+; RV32-NEXT:    vand.vx v16, v8, a0
+; RV32-NEXT:    vsll.vx v16, v16, a2
+; RV32-NEXT:    vor.vv v16, v18, v16
+; RV32-NEXT:    vlse64.v v18, (a5), zero
+; RV32-NEXT:    vand.vv v12, v12, v18
+; RV32-NEXT:    vor.vv v10, v12, v10
+; RV32-NEXT:    vand.vv v12, v8, v18
 ; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v14, v8
+; RV32-NEXT:    vsll.vi v12, v12, 8
+; RV32-NEXT:    vor.vv v10, v10, v14
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -431,32 +431,32 @@ define <vscale x 4 x i64> @bswap_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsrl.vi v12, v8, 8
 ; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    vsrl.vx v20, v8, a2
+; RV32-NEXT:    vsrl.vx v20, v8, a1
+; RV32-NEXT:    vsrl.vx v24, v8, a2
 ; RV32-NEXT:    addi a0, a3, -256
-; RV32-NEXT:    vsll.vx v24, v8, a1
-; RV32-NEXT:    vand.vx v20, v20, a0
-; RV32-NEXT:    vlse64.v v28, (a5), zero
-; RV32-NEXT:    vor.vv v16, v20, v16
-; RV32-NEXT:    vand.vx v20, v8, a0
-; RV32-NEXT:    vsll.vx v20, v20, a2
+; RV32-NEXT:    vand.vx v16, v16, a4
+; RV32-NEXT:    vsll.vx v28, v8, a1
+; RV32-NEXT:    vand.vx v24, v24, a0
 ; RV32-NEXT:    vor.vv v20, v24, v20
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vx v12, v12, a4
-; RV32-NEXT:    vand.vv v24, v24, v28
-; RV32-NEXT:    vor.vv v12, v24, v12
-; RV32-NEXT:    vand.vv v24, v8, v28
+; RV32-NEXT:    vand.vx v24, v8, a0
+; RV32-NEXT:    vsll.vx v24, v24, a2
+; RV32-NEXT:    vor.vv v24, v28, v24
+; RV32-NEXT:    vlse64.v v28, (a5), zero
+; RV32-NEXT:    vand.vv v12, v12, v28
+; RV32-NEXT:    vor.vv v12, v12, v16
+; RV32-NEXT:    vand.vv v16, v8, v28
 ; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vor.vv v8, v20, v8
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v12, v12, v20
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -518,38 +518,38 @@ define <vscale x 8 x i64> @bswap_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    addi a5, sp, 8
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
 ; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vsll.vx v0, v8, a1
-; RV32-NEXT:    vand.vx v24, v24, a0
+; RV32-NEXT:    vand.vx v24, v24, a3
 ; RV32-NEXT:    vor.vv v16, v24, v16
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v16, v8, a0
+; RV32-NEXT:    vand.vx v16, v8, a3
 ; RV32-NEXT:    vsll.vx v16, v16, a2
 ; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v0, (a5), zero
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a4
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 24
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a2, sp, 8
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    vand.vx v0, v0, a1
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vlse64.v v16, (a2), zero
+; RV32-NEXT:    vand.vv v24, v24, v16
+; RV32-NEXT:    vor.vv v24, v24, v0
 ; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v16, v24
-; RV32-NEXT:    vand.vv v16, v8, v0
-; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vand.vx v8, v8, a1
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v16, v16, 8
 ; RV32-NEXT:    vor.vv v8, v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index 0c58cca0f9472..8243e103a9271 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -604,29 +604,29 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 24
+; RV32-NEXT:    vsrl.vi v10, v8, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsll.vx v10, v8, a2
+; RV32-NEXT:    vsll.vx v11, v8, a2
 ; RV32-NEXT:    addi a0, a3, -256
-; RV32-NEXT:    vsrl.vx v11, v8, a2
-; RV32-NEXT:    vsrl.vx v12, v8, a4
+; RV32-NEXT:    vsrl.vx v12, v8, a2
+; RV32-NEXT:    vsrl.vx v13, v8, a4
+; RV32-NEXT:    vand.vx v9, v9, a5
+; RV32-NEXT:    vand.vx v13, v13, a0
+; RV32-NEXT:    vor.vv v12, v13, v12
 ; RV32-NEXT:    vand.vx v13, v8, a0
-; RV32-NEXT:    vand.vx v12, v12, a0
-; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vlse64.v v12, (a6), zero
 ; RV32-NEXT:    vsll.vx v13, v13, a4
-; RV32-NEXT:    vor.vv v10, v10, v13
-; RV32-NEXT:    vsrl.vi v13, v8, 8
-; RV32-NEXT:    vand.vx v9, v9, a5
-; RV32-NEXT:    vand.vv v13, v13, v12
-; RV32-NEXT:    vor.vv v9, v13, v9
-; RV32-NEXT:    vand.vv v12, v8, v12
+; RV32-NEXT:    vor.vv v11, v11, v13
+; RV32-NEXT:    vlse64.v v13, (a6), zero
+; RV32-NEXT:    vand.vv v10, v10, v13
+; RV32-NEXT:    vor.vv v9, v10, v9
+; RV32-NEXT:    vand.vv v10, v8, v13
 ; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v12, v12, 8
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v10, v8
-; RV32-NEXT:    vor.vv v9, v9, v11
+; RV32-NEXT:    vsll.vi v10, v10, 8
+; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v8, v11, v8
+; RV32-NEXT:    vor.vv v9, v9, v12
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -772,29 +772,29 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vi v10, v8, 24
+; RV32-NEXT:    vsrl.vi v12, v8, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsll.vx v12, v8, a2
+; RV32-NEXT:    vsll.vx v14, v8, a2
 ; RV32-NEXT:    addi a0, a3, -256
-; RV32-NEXT:    vsrl.vx v14, v8, a2
-; RV32-NEXT:    vsrl.vx v16, v8, a4
+; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    vsrl.vx v18, v8, a4
+; RV32-NEXT:    vand.vx v10, v10, a5
+; RV32-NEXT:    vand.vx v18, v18, a0
+; RV32-NEXT:    vor.vv v16, v18, v16
 ; RV32-NEXT:    vand.vx v18, v8, a0
-; RV32-NEXT:    vand.vx v16, v16, a0
-; RV32-NEXT:    vor.vv v14, v16, v14
-; RV32-NEXT:    vlse64.v v16, (a6), zero
 ; RV32-NEXT:    vsll.vx v18, v18, a4
-; RV32-NEXT:    vor.vv v12, v12, v18
-; RV32-NEXT:    vsrl.vi v18, v8, 8
-; RV32-NEXT:    vand.vx v10, v10, a5
-; RV32-NEXT:    vand.vv v18, v18, v16
-; RV32-NEXT:    vor.vv v10, v18, v10
-; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vor.vv v14, v14, v18
+; RV32-NEXT:    vlse64.v v18, (a6), zero
+; RV32-NEXT:    vand.vv v12, v12, v18
+; RV32-NEXT:    vor.vv v10, v12, v10
+; RV32-NEXT:    vand.vv v12, v8, v18
 ; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vor.vv v10, v10, v14
+; RV32-NEXT:    vsll.vi v12, v12, 8
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v8, v14, v8
+; RV32-NEXT:    vor.vv v10, v10, v16
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -940,29 +940,29 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    vsrl.vi v16, v8, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsll.vx v16, v8, a2
+; RV32-NEXT:    vsll.vx v20, v8, a2
 ; RV32-NEXT:    addi a0, a3, -256
-; RV32-NEXT:    vsrl.vx v20, v8, a2
-; RV32-NEXT:    vsrl.vx v24, v8, a4
+; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    vsrl.vx v28, v8, a4
+; RV32-NEXT:    vand.vx v12, v12, a5
+; RV32-NEXT:    vand.vx v28, v28, a0
+; RV32-NEXT:    vor.vv v24, v28, v24
 ; RV32-NEXT:    vand.vx v28, v8, a0
-; RV32-NEXT:    vand.vx v24, v24, a0
-; RV32-NEXT:    vor.vv v20, v24, v20
-; RV32-NEXT:    vlse64.v v24, (a6), zero
 ; RV32-NEXT:    vsll.vx v28, v28, a4
-; RV32-NEXT:    vor.vv v16, v16, v28
-; RV32-NEXT:    vsrl.vi v28, v8, 8
-; RV32-NEXT:    vand.vx v12, v12, a5
-; RV32-NEXT:    vand.vv v28, v28, v24
-; RV32-NEXT:    vor.vv v12, v28, v12
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vor.vv v20, v20, v28
+; RV32-NEXT:    vlse64.v v28, (a6), zero
+; RV32-NEXT:    vand.vv v16, v16, v28
+; RV32-NEXT:    vor.vv v12, v16, v12
+; RV32-NEXT:    vand.vv v16, v8, v28
 ; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v12, v12, v20
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v20, v8
+; RV32-NEXT:    vor.vv v12, v12, v24
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1022,59 +1022,61 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vmv8r.v v16, v8
+; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    addi a5, sp, 8
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV32-NEXT:    addi a0, a3, -256
-; RV32-NEXT:    vand.vx v24, v8, a0, v0.t
-; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v16, (a5), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a1, 4080
-; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v8, v8, a1, v0.t
+; RV32-NEXT:    addi a0, a2, -256
+; RV32-NEXT:    vand.vx v24, v16, a0, v0.t
+; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v8, v16, a4, v0.t
+; RV32-NEXT:    vsll.vi v8, v8, 24, v0.t
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v8, (a5), zero
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v24, v16, v8, v0.t
+; RV32-NEXT:    vsll.vi v8, v24, 8, v0.t
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v8, v16, a1, v0.t
+; RV32-NEXT:    vsrl.vx v24, v16, a3, v0.t
 ; RV32-NEXT:    vand.vx v24, v24, a0, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 24, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
@@ -1174,45 +1176,46 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    lui a5, 4080
-; RV32-NEXT:    addi a6, sp, 8
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v24, v8, a2
-; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    addi a2, a3, -256
 ; RV32-NEXT:    vsrl.vx v0, v8, a4
-; RV32-NEXT:    vand.vx v0, v0, a0
+; RV32-NEXT:    vand.vx v0, v0, a2
 ; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a0
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a2
 ; RV32-NEXT:    vsll.vx v0, v0, a4
 ; RV32-NEXT:    vor.vv v16, v24, v0
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v0, (a6), zero
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a5
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 24
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    vand.vx v0, v0, a5
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vand.vv v24, v24, v16
+; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    vand.vv v16, v8, v16
 ; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v24, v8
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -1292,59 +1295,61 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vmv8r.v v16, v8
+; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    lui a4, 4080
 ; RV32-NEXT:    addi a5, sp, 8
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV32-NEXT:    addi a0, a3, -256
-; RV32-NEXT:    vand.vx v24, v8, a0, v0.t
-; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v16, (a5), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a1, 4080
-; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v8, v8, a1, v0.t
+; RV32-NEXT:    addi a0, a2, -256
+; RV32-NEXT:    vand.vx v24, v16, a0, v0.t
+; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v8, v16, a4, v0.t
+; RV32-NEXT:    vsll.vi v8, v8, 24, v0.t
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v8, (a5), zero
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v24, v16, v8, v0.t
+; RV32-NEXT:    vsll.vi v8, v24, 8, v0.t
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v8, v16, a1, v0.t
+; RV32-NEXT:    vsrl.vx v24, v16, a3, v0.t
 ; RV32-NEXT:    vand.vx v24, v24, a0, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 24, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
@@ -1444,45 +1449,46 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    lui a5, 4080
-; RV32-NEXT:    addi a6, sp, 8
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v24, v8, a2
-; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    addi a2, a3, -256
 ; RV32-NEXT:    vsrl.vx v0, v8, a4
-; RV32-NEXT:    vand.vx v0, v0, a0
+; RV32-NEXT:    vand.vx v0, v0, a2
 ; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a0
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a2
 ; RV32-NEXT:    vsll.vx v0, v0, a4
 ; RV32-NEXT:    vor.vv v16, v24, v0
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v0, (a6), zero
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a5
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 24
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    vand.vx v0, v0, a5
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vlse64.v v16, (a0), zero
+; RV32-NEXT:    vand.vv v24, v24, v16
+; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    vand.vv v16, v8, v16
 ; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v24, v8
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
index 15f6ca600cb37..b95bc73936059 100644
--- a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
@@ -75,17 +75,17 @@ define fastcc <vscale x 64 x i32> @ret_split_nxv64i32(ptr %x) {
 ; CHECK-NEXT:    slli a4, a2, 5
 ; CHECK-NEXT:    slli a2, a2, 4
 ; CHECK-NEXT:    sub a4, a4, a3
-; CHECK-NEXT:    add a5, a1, a2
-; CHECK-NEXT:    vl8re32.v v16, (a5)
 ; CHECK-NEXT:    add a5, a1, a3
+; CHECK-NEXT:    vl8re32.v v16, (a5)
+; CHECK-NEXT:    add a5, a1, a2
 ; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    add a1, a1, a4
 ; CHECK-NEXT:    vl8re32.v v24, (a5)
 ; CHECK-NEXT:    vl8re32.v v0, (a1)
 ; CHECK-NEXT:    vs8r.v v8, (a0)
-; CHECK-NEXT:    vs8r.v v16, (a2)
-; CHECK-NEXT:    vs8r.v v24, (a3)
+; CHECK-NEXT:    vs8r.v v24, (a2)
+; CHECK-NEXT:    vs8r.v v16, (a3)
 ; CHECK-NEXT:    add a0, a0, a4
 ; CHECK-NEXT:    vs8r.v v0, (a0)
 ; CHECK-NEXT:    ret
@@ -245,59 +245,21 @@ define fastcc <vscale x 32 x i1> @ret_nxv32i1_param_nxv32i1_nxv32i1(<vscale x 32
 define fastcc <vscale x 32 x i32> @ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32_i32(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y, <vscale x 32 x i32> %z, i32 %w) {
 ; CHECK-LABEL: ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    vl8re32.v v8, (a2)
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re32.v v0, (a0)
+; CHECK-NEXT:    vl8re32.v v24, (a0)
+; CHECK-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v8, v24
+; CHECK-NEXT:    vl8re32.v v24, (a2)
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a2, a2, a1
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl8re32.v v8, (a0)
-; CHECK-NEXT:    vl8re32.v v16, (a2)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vadd.vv v0, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vadd.vv v24, v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vadd.vv v8, v0, v8
-; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    vadd.vx v16, v8, a4
-; CHECK-NEXT:    vadd.vx v8, v24, a4
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vadd.vv v8, v8, v24
+; CHECK-NEXT:    vl8re32.v v24, (a0)
+; CHECK-NEXT:    vadd.vv v16, v16, v24
+; CHECK-NEXT:    vl8re32.v v24, (a2)
+; CHECK-NEXT:    vadd.vv v16, v16, v24
+; CHECK-NEXT:    vadd.vx v16, v16, a4
+; CHECK-NEXT:    vadd.vx v8, v8, a4
 ; CHECK-NEXT:    ret
   %r = add <vscale x 32 x i32> %x, %y
   %s = add <vscale x 32 x i32> %r, %z
@@ -325,19 +287,19 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_i32(<vsca
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    andi sp, sp, -128
+; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT:    vmv8r.v v0, v8
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a3, a0, a1
 ; RV32-NEXT:    vl8re32.v v24, (a3)
-; RV32-NEXT:    vl8re32.v v0, (a0)
-; RV32-NEXT:    addi a3, sp, 128
+; RV32-NEXT:    vl8re32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 128
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    add a1, a0, a1
 ; RV32-NEXT:    addi a0, sp, 128
-; RV32-NEXT:    vs8r.v v8, (a3)
-; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    li a3, 2
 ; RV32-NEXT:    vs8r.v v16, (a1)
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv8r.v v8, v0
+; RV32-NEXT:    li a3, 2
 ; RV32-NEXT:    vmv8r.v v16, v24
 ; RV32-NEXT:    call ext2
 ; RV32-NEXT:    addi sp, s0, -144
@@ -364,19 +326,19 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_i32(<vsca
 ; RV64-NEXT:    slli a1, a1, 4
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    andi sp, sp, -128
+; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT:    vmv8r.v v0, v8
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a3, a0, a1
 ; RV64-NEXT:    vl8re32.v v24, (a3)
-; RV64-NEXT:    vl8re32.v v0, (a0)
-; RV64-NEXT:    addi a3, sp, 128
+; RV64-NEXT:    vl8re32.v v8, (a0)
+; RV64-NEXT:    addi a0, sp, 128
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    add a1, a0, a1
 ; RV64-NEXT:    addi a0, sp, 128
-; RV64-NEXT:    vs8r.v v8, (a3)
-; RV64-NEXT:    add a1, a3, a1
-; RV64-NEXT:    li a3, 2
 ; RV64-NEXT:    vs8r.v v16, (a1)
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv8r.v v8, v0
+; RV64-NEXT:    li a3, 2
 ; RV64-NEXT:    vmv8r.v v16, v24
 ; RV64-NEXT:    call ext2
 ; RV64-NEXT:    addi sp, s0, -144
@@ -410,33 +372,37 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    addi a1, sp, 128
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT:    vmv8r.v v0, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    vl8re32.v v16, (a2)
+; RV32-NEXT:    vl8re32.v v8, (a2)
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 128
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a2, a2, a1
 ; RV32-NEXT:    add a3, a0, a1
-; RV32-NEXT:    vl8re32.v v0, (a2)
-; RV32-NEXT:    vl8re32.v v24, (a3)
-; RV32-NEXT:    vl8re32.v v16, (a0)
+; RV32-NEXT:    vl8re32.v v24, (a2)
+; RV32-NEXT:    vl8re32.v v16, (a3)
+; RV32-NEXT:    vl8re32.v v8, (a0)
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vs8r.v v8, (a0)
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 128
-; RV32-NEXT:    vs8r.v v16, (a3)
+; RV32-NEXT:    vs8r.v v0, (a0)
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 128
+; RV32-NEXT:    vs8r.v v8, (a2)
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    addi a2, sp, 128
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a3, sp, 128
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    vs8r.v v16, (a1)
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
@@ -445,16 +411,13 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
 ; RV32-NEXT:    slli a2, a2, 4
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 128
-; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    li a5, 42
-; RV32-NEXT:    vs8r.v v24, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 128
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv8r.v v16, v0
+; RV32-NEXT:    vmv8r.v v16, v24
 ; RV32-NEXT:    call ext3
 ; RV32-NEXT:    addi sp, s0, -144
 ; RV32-NEXT:    .cfi_def_cfa sp, 144
@@ -483,33 +446,37 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
 ; RV64-NEXT:    andi sp, sp, -128
 ; RV64-NEXT:    addi a1, sp, 128
 ; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT:    vmv8r.v v0, v8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    vl8re32.v v16, (a2)
+; RV64-NEXT:    vl8re32.v v8, (a2)
 ; RV64-NEXT:    csrr a3, vlenb
 ; RV64-NEXT:    slli a3, a3, 3
 ; RV64-NEXT:    add a3, sp, a3
 ; RV64-NEXT:    addi a3, a3, 128
-; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a2, a2, a1
 ; RV64-NEXT:    add a3, a0, a1
-; RV64-NEXT:    vl8re32.v v0, (a2)
-; RV64-NEXT:    vl8re32.v v24, (a3)
-; RV64-NEXT:    vl8re32.v v16, (a0)
+; RV64-NEXT:    vl8re32.v v24, (a2)
+; RV64-NEXT:    vl8re32.v v16, (a3)
+; RV64-NEXT:    vl8re32.v v8, (a0)
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 4
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 128
-; RV64-NEXT:    vs8r.v v8, (a0)
-; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    slli a3, a3, 5
-; RV64-NEXT:    add a3, sp, a3
-; RV64-NEXT:    addi a3, a3, 128
-; RV64-NEXT:    vs8r.v v16, (a3)
+; RV64-NEXT:    vs8r.v v0, (a0)
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 5
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 128
+; RV64-NEXT:    vs8r.v v8, (a2)
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    addi a2, sp, 128
-; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    addi a3, sp, 128
+; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    vs8r.v v16, (a1)
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 5
 ; RV64-NEXT:    add a0, sp, a0
@@ -518,16 +485,13 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
 ; RV64-NEXT:    slli a2, a2, 4
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 128
-; RV64-NEXT:    add a1, a3, a1
 ; RV64-NEXT:    li a5, 42
-; RV64-NEXT:    vs8r.v v24, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 128
 ; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv8r.v v16, v0
+; RV64-NEXT:    vmv8r.v v16, v24
 ; RV64-NEXT:    call ext3
 ; RV64-NEXT:    addi sp, s0, -144
 ; RV64-NEXT:    .cfi_def_cfa sp, 144
@@ -551,11 +515,11 @@ define fastcc <vscale x 32 x i32> @vector_arg_indirect_stack(i32 %0, i32 %1, i32
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, t5, a0
-; CHECK-NEXT:    vl8re32.v v24, (t5)
-; CHECK-NEXT:    vl8re32.v v0, (a0)
+; CHECK-NEXT:    vl8re32.v v24, (a0)
+; CHECK-NEXT:    vl8re32.v v0, (t5)
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vadd.vv v8, v8, v24
-; CHECK-NEXT:    vadd.vv v16, v16, v0
+; CHECK-NEXT:    vadd.vv v8, v8, v0
+; CHECK-NEXT:    vadd.vv v16, v16, v24
 ; CHECK-NEXT:    ret
   %s = add <vscale x 32 x i32> %x, %z
   ret <vscale x 32 x i32> %s
@@ -608,8 +572,8 @@ define fastcc <vscale x 32 x i32> @pass_vector_arg_indirect_stack(<vscale x 32 x
 ; RV32-NEXT:    add t3, s1, t3
 ; RV32-NEXT:    addi t3, t3, 128
 ; RV32-NEXT:    vs8r.v v8, (t0)
-; RV32-NEXT:    addi t5, s1, 128
 ; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    addi t5, s1, 128
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    vmv.v.i v16, 0
 ; RV32-NEXT:    call vector_arg_indirect_stack
@@ -671,8 +635,8 @@ define fastcc <vscale x 32 x i32> @pass_vector_arg_indirect_stack(<vscale x 32 x
 ; RV64-NEXT:    add t3, s1, t3
 ; RV64-NEXT:    addi t3, t3, 128
 ; RV64-NEXT:    vs8r.v v8, (t0)
-; RV64-NEXT:    addi t5, s1, 128
 ; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    addi t5, s1, 128
 ; RV64-NEXT:    li a0, 0
 ; RV64-NEXT:    vmv.v.i v16, 0
 ; RV64-NEXT:    call vector_arg_indirect_stack
@@ -746,9 +710,9 @@ define fastcc <vscale x 16 x i32> @pass_vector_arg_indirect_stack_no_gpr(<vscale
 ; RV32-NEXT:    li t3, 8
 ; RV32-NEXT:    li t4, 9
 ; RV32-NEXT:    li t5, 10
-; RV32-NEXT:    li t6, 11
 ; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    sw a0, 0(sp)
+; RV32-NEXT:    li t6, 11
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    vmv.v.i v16, 0
 ; RV32-NEXT:    call vector_arg_indirect_stack_no_gpr
@@ -796,9 +760,9 @@ define fastcc <vscale x 16 x i32> @pass_vector_arg_indirect_stack_no_gpr(<vscale
 ; RV64-NEXT:    li t3, 8
 ; RV64-NEXT:    li t4, 9
 ; RV64-NEXT:    li t5, 10
-; RV64-NEXT:    li t6, 11
 ; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    sd a0, 0(sp)
+; RV64-NEXT:    li t6, 11
 ; RV64-NEXT:    li a0, 0
 ; RV64-NEXT:    vmv.v.i v16, 0
 ; RV64-NEXT:    call vector_arg_indirect_stack_no_gpr
diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
index 2e181e0914c88..469b4a9ec3fd1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
@@ -9,11 +9,11 @@ define <vscale x 32 x i32> @callee_scalable_vector_split_indirect(<vscale x 32 x
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, a0, a1
-; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vl8re32.v v0, (a1)
+; CHECK-NEXT:    vl8re32.v v24, (a1)
+; CHECK-NEXT:    vl8re32.v v0, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vadd.vv v8, v8, v24
-; CHECK-NEXT:    vadd.vv v16, v16, v0
+; CHECK-NEXT:    vadd.vv v8, v8, v0
+; CHECK-NEXT:    vadd.vv v16, v16, v24
 ; CHECK-NEXT:    ret
   %a = add <vscale x 32 x i32> %x, %y
   ret <vscale x 32 x i32> %a
@@ -41,9 +41,9 @@ define <vscale x 32 x i32> @caller_scalable_vector_split_indirect(<vscale x 32 x
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.i v8, 0
 ; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    vs8r.v v16, (a0)
 ; RV32-NEXT:    addi a0, sp, 128
-; RV32-NEXT:    vs8r.v v16, (a1)
 ; RV32-NEXT:    vmv.v.i v16, 0
 ; RV32-NEXT:    call callee_scalable_vector_split_indirect
 ; RV32-NEXT:    addi sp, s0, -144
@@ -76,9 +76,9 @@ define <vscale x 32 x i32> @caller_scalable_vector_split_indirect(<vscale x 32 x
 ; RV64-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV64-NEXT:    vmv.v.i v8, 0
 ; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    vs8r.v v16, (a0)
 ; RV64-NEXT:    addi a0, sp, 128
-; RV64-NEXT:    vs8r.v v16, (a1)
 ; RV64-NEXT:    vmv.v.i v16, 0
 ; RV64-NEXT:    call callee_scalable_vector_split_indirect
 ; RV64-NEXT:    addi sp, s0, -144
diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
index 1b9c78a20ec3b..34c2c70aab25e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
@@ -22,12 +22,12 @@ define <vscale x 1 x bfloat> @vp_ceil_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vs
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v11, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -49,11 +49,11 @@ define <vscale x 1 x bfloat> @vp_ceil_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat>
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -76,12 +76,12 @@ define <vscale x 2 x bfloat> @vp_ceil_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vs
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v11, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv.v.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -103,11 +103,11 @@ define <vscale x 2 x bfloat> @vp_ceil_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat>
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -130,12 +130,12 @@ define <vscale x 4 x bfloat> @vp_ceil_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vs
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v10, v0.t
@@ -157,11 +157,11 @@ define <vscale x 4 x bfloat> @vp_ceil_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat>
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -184,12 +184,12 @@ define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vs
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v12, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v12, v0.t
@@ -211,11 +211,11 @@ define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat>
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -238,12 +238,12 @@ define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16(<vscale x 16 x bfloat> %va,
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
@@ -265,11 +265,11 @@ define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16_unmasked(<vscale x 16 x bflo
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -297,6 +297,7 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    lui a3, 307200
+; CHECK-NEXT:    fsrmi a4, 3
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
@@ -315,11 +316,10 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
 ; CHECK-NEXT:    vfabs.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v18, v8, fa5, v0.t
-; CHECK-NEXT:    fsrmi a2, 3
 ; CHECK-NEXT:    vmv1r.v v0, v18
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a4
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -336,11 +336,11 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v7
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
@@ -375,11 +375,12 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16_unmasked(<vscale x 32 x bflo
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmset.m v16
 ; CHECK-NEXT:    lui a3, 307200
+; CHECK-NEXT:    fsrmi a4, 3
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v16, v16, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    vmv1r.v v17, v16
@@ -394,11 +395,10 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16_unmasked(<vscale x 32 x bflo
 ; CHECK-NEXT:    vfabs.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; CHECK-NEXT:    fsrmi a2, 3
 ; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a4
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -413,10 +413,10 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16_unmasked(<vscale x 32 x bflo
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -439,13 +439,13 @@ declare <vscale x 1 x half> @llvm.vp.ceil.nxv1f16(<vscale x 1 x half>, <vscale x
 define <vscale x 1 x half> @vp_ceil_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_vv_nxv1f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI12_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI12_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI12_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI12_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -461,12 +461,12 @@ define <vscale x 1 x half> @vp_ceil_vv_nxv1f16(<vscale x 1 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -485,12 +485,12 @@ define <vscale x 1 x half> @vp_ceil_vv_nxv1f16(<vscale x 1 x half> %va, <vscale
 define <vscale x 1 x half> @vp_ceil_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_vv_nxv1f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI13_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI13_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI13_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI13_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -503,11 +503,11 @@ define <vscale x 1 x half> @vp_ceil_vv_nxv1f16_unmasked(<vscale x 1 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -525,13 +525,13 @@ declare <vscale x 2 x half> @llvm.vp.ceil.nxv2f16(<vscale x 2 x half>, <vscale x
 define <vscale x 2 x half> @vp_ceil_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_vv_nxv2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI14_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI14_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI14_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI14_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -547,12 +547,12 @@ define <vscale x 2 x half> @vp_ceil_vv_nxv2f16(<vscale x 2 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -571,12 +571,12 @@ define <vscale x 2 x half> @vp_ceil_vv_nxv2f16(<vscale x 2 x half> %va, <vscale
 define <vscale x 2 x half> @vp_ceil_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_vv_nxv2f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI15_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI15_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI15_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI15_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -589,11 +589,11 @@ define <vscale x 2 x half> @vp_ceil_vv_nxv2f16_unmasked(<vscale x 2 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -611,13 +611,13 @@ declare <vscale x 4 x half> @llvm.vp.ceil.nxv4f16(<vscale x 4 x half>, <vscale x
 define <vscale x 4 x half> @vp_ceil_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_vv_nxv4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI16_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI16_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI16_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI16_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -633,12 +633,12 @@ define <vscale x 4 x half> @vp_ceil_vv_nxv4f16(<vscale x 4 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
@@ -657,12 +657,12 @@ define <vscale x 4 x half> @vp_ceil_vv_nxv4f16(<vscale x 4 x half> %va, <vscale
 define <vscale x 4 x half> @vp_ceil_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_vv_nxv4f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI17_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI17_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI17_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI17_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -675,11 +675,11 @@ define <vscale x 4 x half> @vp_ceil_vv_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -699,12 +699,12 @@ define <vscale x 8 x half> @vp_ceil_vv_nxv8f16(<vscale x 8 x half> %va, <vscale
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v10, v0
+; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI18_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI18_0)(a0)
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -721,12 +721,12 @@ define <vscale x 8 x half> @vp_ceil_vv_nxv8f16(<vscale x 8 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
@@ -745,12 +745,12 @@ define <vscale x 8 x half> @vp_ceil_vv_nxv8f16(<vscale x 8 x half> %va, <vscale
 define <vscale x 8 x half> @vp_ceil_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_vv_nxv8f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI19_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI19_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI19_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI19_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -763,11 +763,11 @@ define <vscale x 8 x half> @vp_ceil_vv_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -787,12 +787,12 @@ define <vscale x 16 x half> @vp_ceil_vv_nxv16f16(<vscale x 16 x half> %va, <vsca
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v12, v0
+; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI20_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI20_0)(a0)
-; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -809,12 +809,12 @@ define <vscale x 16 x half> @vp_ceil_vv_nxv16f16(<vscale x 16 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v24, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
@@ -833,12 +833,12 @@ define <vscale x 16 x half> @vp_ceil_vv_nxv16f16(<vscale x 16 x half> %va, <vsca
 define <vscale x 16 x half> @vp_ceil_vv_nxv16f16_unmasked(<vscale x 16 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_vv_nxv16f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI21_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI21_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8
-; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI21_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI21_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -851,11 +851,11 @@ define <vscale x 16 x half> @vp_ceil_vv_nxv16f16_unmasked(<vscale x 16 x half> %
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -875,12 +875,12 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v16, v0
+; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI22_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI22_0)(a0)
-; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vmv1r.v v0, v16
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -902,6 +902,7 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    lui a3, 307200
+; ZVFHMIN-NEXT:    fsrmi a4, 3
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
@@ -920,11 +921,10 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v18, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a2, 3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v18
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    fsrm a2
+; ZVFHMIN-NEXT:    fsrm a4
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -941,11 +941,11 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v7
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
@@ -970,12 +970,12 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 define <vscale x 32 x half> @vp_ceil_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_vv_nxv32f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI23_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI23_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8
-; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI23_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI23_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -995,11 +995,12 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    lui a3, 307200
+; ZVFHMIN-NEXT:    fsrmi a4, 3
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v17, v16
@@ -1014,11 +1015,10 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a2, 3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v17
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    fsrm a2
+; ZVFHMIN-NEXT:    fsrm a4
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -1033,10 +1033,10 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -1064,9 +1064,9 @@ define <vscale x 1 x float> @vp_ceil_vv_nxv1f32(<vscale x 1 x float> %va, <vscal
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1085,8 +1085,8 @@ define <vscale x 1 x float> @vp_ceil_vv_nxv1f32_unmasked(<vscale x 1 x float> %v
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -1106,9 +1106,9 @@ define <vscale x 2 x float> @vp_ceil_vv_nxv2f32(<vscale x 2 x float> %va, <vscal
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1127,8 +1127,8 @@ define <vscale x 2 x float> @vp_ceil_vv_nxv2f32_unmasked(<vscale x 2 x float> %v
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -1149,9 +1149,9 @@ define <vscale x 4 x float> @vp_ceil_vv_nxv4f32(<vscale x 4 x float> %va, <vscal
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -1171,8 +1171,8 @@ define <vscale x 4 x float> @vp_ceil_vv_nxv4f32_unmasked(<vscale x 4 x float> %v
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -1193,9 +1193,9 @@ define <vscale x 8 x float> @vp_ceil_vv_nxv8f32(<vscale x 8 x float> %va, <vscal
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -1215,8 +1215,8 @@ define <vscale x 8 x float> @vp_ceil_vv_nxv8f32_unmasked(<vscale x 8 x float> %v
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -1237,9 +1237,9 @@ define <vscale x 16 x float> @vp_ceil_vv_nxv16f32(<vscale x 16 x float> %va, <vs
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -1259,8 +1259,8 @@ define <vscale x 16 x float> @vp_ceil_vv_nxv16f32_unmasked(<vscale x 16 x float>
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -1276,13 +1276,13 @@ declare <vscale x 1 x double> @llvm.vp.ceil.nxv1f64(<vscale x 1 x double>, <vsca
 define <vscale x 1 x double> @vp_ceil_vv_nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI34_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI34_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
+; CHECK-NEXT:    lui a0, %hi(.LCPI34_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI34_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1297,12 +1297,12 @@ define <vscale x 1 x double> @vp_ceil_vv_nxv1f64(<vscale x 1 x double> %va, <vsc
 define <vscale x 1 x double> @vp_ceil_vv_nxv1f64_unmasked(<vscale x 1 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv1f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI35_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI35_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI35_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI35_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -1320,12 +1320,12 @@ define <vscale x 2 x double> @vp_ceil_vv_nxv2f64(<vscale x 2 x double> %va, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI36_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI36_0)(a0)
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -1341,12 +1341,12 @@ define <vscale x 2 x double> @vp_ceil_vv_nxv2f64(<vscale x 2 x double> %va, <vsc
 define <vscale x 2 x double> @vp_ceil_vv_nxv2f64_unmasked(<vscale x 2 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv2f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI37_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI37_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI37_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI37_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -1364,12 +1364,12 @@ define <vscale x 4 x double> @vp_ceil_vv_nxv4f64(<vscale x 4 x double> %va, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI38_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI38_0)(a0)
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -1385,12 +1385,12 @@ define <vscale x 4 x double> @vp_ceil_vv_nxv4f64(<vscale x 4 x double> %va, <vsc
 define <vscale x 4 x double> @vp_ceil_vv_nxv4f64_unmasked(<vscale x 4 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv4f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI39_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI39_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI39_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI39_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -1408,12 +1408,12 @@ define <vscale x 7 x double> @vp_ceil_vv_nxv7f64(<vscale x 7 x double> %va, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI40_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI40_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -1429,12 +1429,12 @@ define <vscale x 7 x double> @vp_ceil_vv_nxv7f64(<vscale x 7 x double> %va, <vsc
 define <vscale x 7 x double> @vp_ceil_vv_nxv7f64_unmasked(<vscale x 7 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv7f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI41_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI41_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI41_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI41_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -1452,12 +1452,12 @@ define <vscale x 8 x double> @vp_ceil_vv_nxv8f64(<vscale x 8 x double> %va, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI42_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI42_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -1473,12 +1473,12 @@ define <vscale x 8 x double> @vp_ceil_vv_nxv8f64(<vscale x 8 x double> %va, <vsc
 define <vscale x 8 x double> @vp_ceil_vv_nxv8f64_unmasked(<vscale x 8 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv8f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI43_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI43_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI43_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI43_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -1498,59 +1498,66 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
 ; CHECK-NEXT:    srli a3, a1, 3
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    vslidedown.vx v6, v0, a3
+; CHECK-NEXT:    vslidedown.vx v25, v0, a3
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    fsrmi a3, 3
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a2, 3
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmflt.vf v25, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    fsrm a3
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -1570,12 +1577,12 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64_unmasked(<vscale x 16 x doubl
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    fsrmi a3, 3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a2, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a3
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
@@ -1585,8 +1592,8 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64_unmasked(<vscale x 16 x doubl
 ; CHECK-NEXT:  .LBB45_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll b/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
index 482cf83d540c4..ed434deea1a83 100644
--- a/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
@@ -10,19 +10,19 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV32-NEXT:    th.lwd a2, a3, (a0), 0, 3
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV32-NEXT:    vle8.v v8, (a2)
+; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vzext.vf4 v12, v8
 ; RV32-NEXT:    vmv.s.x v8, zero
-; RV32-NEXT:    vredsum.vs v9, v12, v8
-; RV32-NEXT:    vmv.x.s a0, v9
+; RV32-NEXT:    vredsum.vs v10, v12, v8
+; RV32-NEXT:    vmv.x.s a0, v10
 ; RV32-NEXT:    th.swia a0, (a1), 4, 0
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; RV32-NEXT:    vle8.v v9, (a3)
-; RV32-NEXT:    vmv.v.i v10, 0
+; RV32-NEXT:    vle8.v v10, (a3)
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT:    vslideup.vi v9, v10, 4
+; RV32-NEXT:    vslideup.vi v10, v9, 4
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vzext.vf4 v12, v9
+; RV32-NEXT:    vzext.vf4 v12, v10
 ; RV32-NEXT:    vredsum.vs v8, v12, v8
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32-NEXT:    vse32.v v8, (a1)
@@ -33,19 +33,19 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV64-NEXT:    th.ldd a2, a3, (a0), 0, 4
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV64-NEXT:    vle8.v v8, (a2)
+; RV64-NEXT:    vmv.v.i v9, 0
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vzext.vf4 v12, v8
 ; RV64-NEXT:    vmv.s.x v8, zero
-; RV64-NEXT:    vredsum.vs v9, v12, v8
-; RV64-NEXT:    vmv.x.s a0, v9
+; RV64-NEXT:    vredsum.vs v10, v12, v8
+; RV64-NEXT:    vmv.x.s a0, v10
 ; RV64-NEXT:    th.swia a0, (a1), 4, 0
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; RV64-NEXT:    vle8.v v9, (a3)
-; RV64-NEXT:    vmv.v.i v10, 0
+; RV64-NEXT:    vle8.v v10, (a3)
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT:    vslideup.vi v9, v10, 4
+; RV64-NEXT:    vslideup.vi v10, v9, 4
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vzext.vf4 v12, v9
+; RV64-NEXT:    vzext.vf4 v12, v10
 ; RV64-NEXT:    vredsum.vs v8, v12, v8
 ; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-NEXT:    vse32.v v8, (a1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/concat-vector-insert-elt.ll b/llvm/test/CodeGen/RISCV/rvv/concat-vector-insert-elt.ll
index 1343b64b876dc..969c60ba4fc91 100644
--- a/llvm/test/CodeGen/RISCV/rvv/concat-vector-insert-elt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/concat-vector-insert-elt.ll
@@ -48,13 +48,13 @@ define void @v4xi8_concat_vector_insert_idx2(ptr %a, ptr %b, i8 %x) {
 ; CHECK-LABEL: v4xi8_concat_vector_insert_idx2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a1)
-; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v9, (a1)
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a2
+; CHECK-NEXT:    vmv.s.x v9, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vslideup.vi v9, v8, 2
-; CHECK-NEXT:    vse8.v v9, (a0)
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v1 = load <2 x i8>, ptr %a
   %v2 = load <2 x i8>, ptr %b
@@ -68,13 +68,13 @@ define void @v4xi8_concat_vector_insert_idx3(ptr %a, ptr %b, i8 %x) {
 ; CHECK-LABEL: v4xi8_concat_vector_insert_idx3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a1)
-; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v9, (a1)
 ; CHECK-NEXT:    vmv.s.x v10, a2
-; CHECK-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-NEXT:    vslideup.vi v9, v10, 1
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vslideup.vi v9, v8, 2
-; CHECK-NEXT:    vse8.v v9, (a0)
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v1 = load <2 x i8>, ptr %a
   %v2 = load <2 x i8>, ptr %b
@@ -156,26 +156,26 @@ define void @v4xi64_concat_vector_insert_idx2(ptr %a, ptr %b, i64 %x) {
 ; RV32-LABEL: v4xi64_concat_vector_insert_idx2:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vle64.v v8, (a1)
-; RV32-NEXT:    vle64.v v10, (a0)
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    vle64.v v10, (a1)
 ; RV32-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-NEXT:    vslide1down.vx v10, v10, a2
+; RV32-NEXT:    vslide1down.vx v10, v10, a3
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vslideup.vi v10, v8, 2
-; RV32-NEXT:    vse64.v v10, (a0)
+; RV32-NEXT:    vslideup.vi v8, v10, 2
+; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: v4xi64_concat_vector_insert_idx2:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vle64.v v8, (a1)
-; RV64-NEXT:    vle64.v v10, (a0)
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    vle64.v v10, (a1)
 ; RV64-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
-; RV64-NEXT:    vmv.s.x v8, a2
+; RV64-NEXT:    vmv.s.x v10, a2
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vslideup.vi v10, v8, 2
-; RV64-NEXT:    vse64.v v10, (a0)
+; RV64-NEXT:    vslideup.vi v8, v10, 2
+; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    ret
   %v1 = load <2 x i64>, ptr %a
   %v2 = load <2 x i64>, ptr %b
@@ -189,28 +189,28 @@ define void @v4xi64_concat_vector_insert_idx3(ptr %a, ptr %b, i64 %x) {
 ; RV32-LABEL: v4xi64_concat_vector_insert_idx3:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vle64.v v8, (a1)
-; RV32-NEXT:    vle64.v v10, (a0)
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    vle64.v v10, (a1)
 ; RV32-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV32-NEXT:    vslide1down.vx v9, v8, a2
 ; RV32-NEXT:    vslide1down.vx v9, v9, a3
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vslideup.vi v8, v9, 1
+; RV32-NEXT:    vslideup.vi v10, v9, 1
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vslideup.vi v10, v8, 2
-; RV32-NEXT:    vse64.v v10, (a0)
+; RV32-NEXT:    vslideup.vi v8, v10, 2
+; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: v4xi64_concat_vector_insert_idx3:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vle64.v v8, (a1)
-; RV64-NEXT:    vle64.v v10, (a0)
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    vle64.v v10, (a1)
 ; RV64-NEXT:    vmv.s.x v9, a2
-; RV64-NEXT:    vslideup.vi v8, v9, 1
+; RV64-NEXT:    vslideup.vi v10, v9, 1
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vslideup.vi v10, v8, 2
-; RV64-NEXT:    vse64.v v10, (a0)
+; RV64-NEXT:    vslideup.vi v8, v10, 2
+; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    ret
   %v1 = load <2 x i64>, ptr %a
   %v2 = load <2 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
index f6c26bbba89fe..d470b8b9bff18 100644
--- a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
@@ -31,13 +31,12 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan
 ; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v11, 10
+; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vmv1r.v v0, v10
 ; RV32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vmerge.vim v9, v9, 1, v0
-; RV32-NEXT:    vrgather.vi v10, v9, 0
-; RV32-NEXT:    vmsne.vi v0, v10, 0
-; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vmerge.vim v8, v9, 1, v0
+; RV32-NEXT:    vrgather.vi v9, v8, 0
+; RV32-NEXT:    vmsne.vi v0, v9, 0
 ; RV32-NEXT:    vse32.v v11, (a0), v0.t
 ; RV32-NEXT:    ret
 ;
@@ -56,13 +55,13 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan
 ; RV64-NEXT:    vmv.v.i v9, 0
 ; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV64-NEXT:    vmv.v.i v10, 10
-; RV64-NEXT:    vmv1r.v v0, v12
-; RV64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vmerge.vim v9, v9, 1, v0
-; RV64-NEXT:    vrgather.vi v11, v9, 0
-; RV64-NEXT:    vmsne.vi v0, v11, 0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vmv1r.v v0, v12
+; RV64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vmerge.vim v8, v9, 1, v0
+; RV64-NEXT:    vrgather.vi v9, v8, 0
+; RV64-NEXT:    vmsne.vi v0, v9, 0
 ; RV64-NEXT:    vse32.v v10, (a0), v0.t
 ; RV64-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/copyprop.mir b/llvm/test/CodeGen/RISCV/rvv/copyprop.mir
index a9da6c305aac3..663716f97deee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/copyprop.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/copyprop.mir
@@ -6,9 +6,9 @@
   ; CHECK-LABEL: foo:
   ; CHECK:       # %bb.0: # %entry
   ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+  ; CHECK-NEXT:    vsll.vi v9, v8, 5
   ; CHECK-NEXT:    vmsne.vi v0, v8, 0
-  ; CHECK-NEXT:    vsll.vi v8, v8, 5
-  ; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
+  ; CHECK-NEXT:    vmerge.vim v8, v9, -1, v0
   ; CHECK-NEXT:    sf.vc.v.x 3, 31, v9, a1
   ; CHECK-NEXT:    bgeu a0, zero, .LBB0_3
   ; CHECK-NEXT:  # %bb.1: # %entry
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
index 208735b18cbab..024e976d8880c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
@@ -809,12 +809,12 @@ define <vscale x 1 x i32> @ctlz_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    li a1, 158
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vminu.vx v8, v8, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    li a0, 158
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-F-NEXT:    li a0, 32
+; CHECK-F-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv1i32:
@@ -881,12 +881,12 @@ define <vscale x 2 x i32> @ctlz_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    li a1, 158
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vminu.vx v8, v8, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    li a0, 158
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-F-NEXT:    li a0, 32
+; CHECK-F-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv2i32:
@@ -953,12 +953,12 @@ define <vscale x 4 x i32> @ctlz_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    li a1, 158
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vminu.vx v8, v8, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    li a0, 158
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-F-NEXT:    li a0, 32
+; CHECK-F-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv4i32:
@@ -1025,12 +1025,12 @@ define <vscale x 8 x i32> @ctlz_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    li a1, 158
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vminu.vx v8, v8, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    li a0, 158
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-F-NEXT:    li a0, 32
+; CHECK-F-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv8i32:
@@ -1097,12 +1097,12 @@ define <vscale x 16 x i32> @ctlz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    li a1, 158
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vminu.vx v8, v8, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    li a0, 158
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-F-NEXT:    li a0, 32
+; CHECK-F-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv16i32:
@@ -1110,12 +1110,12 @@ define <vscale x 16 x i32> @ctlz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-D-NEXT:    fsrmi a0, 1
 ; CHECK-D-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    li a1, 158
-; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 32
-; CHECK-D-NEXT:    vminu.vx v8, v8, a1
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 158
+; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 32
+; CHECK-D-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: ctlz_nxv16i32:
@@ -1232,16 +1232,16 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-F-LABEL: ctlz_nxv1i64:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    li a0, 190
-; CHECK-F-NEXT:    fsrmi a1, 1
-; CHECK-F-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
-; CHECK-F-NEXT:    vfncvt.f.xu.w v9, v8
-; CHECK-F-NEXT:    vmv.v.x v8, a0
-; CHECK-F-NEXT:    vsrl.vi v9, v9, 23
-; CHECK-F-NEXT:    vwsubu.vv v10, v8, v9
+; CHECK-F-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT:    vmv.v.x v9, a0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v8
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v8, v10, 23
+; CHECK-F-NEXT:    vwsubu.vv v10, v9, v8
 ; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-F-NEXT:    vminu.vx v8, v10, a0
-; CHECK-F-NEXT:    fsrm a1
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv1i64:
@@ -1249,13 +1249,13 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-D-NEXT:    fsrmi a0, 1
 ; CHECK-D-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    li a1, 52
-; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 1086
-; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 64
-; CHECK-D-NEXT:    vminu.vx v8, v8, a1
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 1086
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 64
+; CHECK-D-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: ctlz_nxv1i64:
@@ -1372,16 +1372,16 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-F-LABEL: ctlz_nxv2i64:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    li a0, 190
-; CHECK-F-NEXT:    fsrmi a1, 1
-; CHECK-F-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v8
-; CHECK-F-NEXT:    vmv.v.x v8, a0
-; CHECK-F-NEXT:    vsrl.vi v9, v10, 23
-; CHECK-F-NEXT:    vwsubu.vv v10, v8, v9
+; CHECK-F-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-F-NEXT:    vmv.v.x v10, a0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfncvt.f.xu.w v11, v8
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v8, v11, 23
+; CHECK-F-NEXT:    vwsubu.vv v12, v10, v8
 ; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-F-NEXT:    vminu.vx v8, v10, a0
-; CHECK-F-NEXT:    fsrm a1
+; CHECK-F-NEXT:    vminu.vx v8, v12, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv2i64:
@@ -1389,13 +1389,13 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-D-NEXT:    fsrmi a0, 1
 ; CHECK-D-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    li a1, 52
-; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 1086
-; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 64
-; CHECK-D-NEXT:    vminu.vx v8, v8, a1
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 1086
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 64
+; CHECK-D-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: ctlz_nxv2i64:
@@ -1512,16 +1512,16 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-F-LABEL: ctlz_nxv4i64:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    li a0, 190
-; CHECK-F-NEXT:    fsrmi a1, 1
-; CHECK-F-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; CHECK-F-NEXT:    vfncvt.f.xu.w v12, v8
-; CHECK-F-NEXT:    vmv.v.x v8, a0
-; CHECK-F-NEXT:    vsrl.vi v10, v12, 23
-; CHECK-F-NEXT:    vwsubu.vv v12, v8, v10
+; CHECK-F-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-F-NEXT:    vmv.v.x v12, a0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfncvt.f.xu.w v14, v8
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v8, v14, 23
+; CHECK-F-NEXT:    vwsubu.vv v16, v12, v8
 ; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-F-NEXT:    vminu.vx v8, v12, a0
-; CHECK-F-NEXT:    fsrm a1
+; CHECK-F-NEXT:    vminu.vx v8, v16, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv4i64:
@@ -1529,13 +1529,13 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-D-NEXT:    fsrmi a0, 1
 ; CHECK-D-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    li a1, 52
-; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 1086
-; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 64
-; CHECK-D-NEXT:    vminu.vx v8, v8, a1
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 1086
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 64
+; CHECK-D-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: ctlz_nxv4i64:
@@ -1652,16 +1652,16 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-F-LABEL: ctlz_nxv8i64:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    li a0, 190
-; CHECK-F-NEXT:    fsrmi a1, 1
-; CHECK-F-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; CHECK-F-NEXT:    vfncvt.f.xu.w v16, v8
-; CHECK-F-NEXT:    vmv.v.x v8, a0
-; CHECK-F-NEXT:    vsrl.vi v12, v16, 23
-; CHECK-F-NEXT:    vwsubu.vv v16, v8, v12
+; CHECK-F-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; CHECK-F-NEXT:    vmv.v.x v16, a0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfncvt.f.xu.w v20, v8
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v8, v20, 23
+; CHECK-F-NEXT:    vwsubu.vv v24, v16, v8
 ; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-F-NEXT:    vminu.vx v8, v16, a0
-; CHECK-F-NEXT:    fsrm a1
+; CHECK-F-NEXT:    vminu.vx v8, v24, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv8i64:
@@ -1669,13 +1669,13 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-D-NEXT:    fsrmi a0, 1
 ; CHECK-D-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    li a1, 52
-; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 1086
-; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 64
-; CHECK-D-NEXT:    vminu.vx v8, v8, a1
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 1086
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 64
+; CHECK-D-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: ctlz_nxv8i64:
@@ -2436,10 +2436,10 @@ define <vscale x 1 x i32> @ctlz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    li a1, 158
-; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a0, 158
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv1i32:
@@ -2503,10 +2503,10 @@ define <vscale x 2 x i32> @ctlz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    li a1, 158
-; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a0, 158
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv2i32:
@@ -2570,10 +2570,10 @@ define <vscale x 4 x i32> @ctlz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    li a1, 158
-; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a0, 158
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv4i32:
@@ -2637,10 +2637,10 @@ define <vscale x 8 x i32> @ctlz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    li a1, 158
-; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a0, 158
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv8i32:
@@ -2704,10 +2704,10 @@ define <vscale x 16 x i32> @ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    li a1, 158
-; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a0, 158
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv16i32:
@@ -2715,10 +2715,10 @@ define <vscale x 16 x i32> @ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-D-NEXT:    fsrmi a0, 1
 ; CHECK-D-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-D-NEXT:    li a1, 158
-; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-D-NEXT:    li a0, 158
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv16i32:
@@ -2838,9 +2838,9 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-F-NEXT:    vmv.v.x v9, a0
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v8
+; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    vsrl.vi v10, v10, 23
 ; CHECK-F-NEXT:    vwsubu.vv v8, v9, v10
-; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv1i64:
@@ -2848,11 +2848,11 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-D-NEXT:    fsrmi a0, 1
 ; CHECK-D-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    li a1, 52
-; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 1086
-; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 1086
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv1i64:
@@ -2972,9 +2972,9 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-F-NEXT:    vmv.v.x v10, a0
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v11, v8
+; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    vsrl.vi v11, v11, 23
 ; CHECK-F-NEXT:    vwsubu.vv v8, v10, v11
-; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv2i64:
@@ -2982,11 +2982,11 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-D-NEXT:    fsrmi a0, 1
 ; CHECK-D-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    li a1, 52
-; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 1086
-; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 1086
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv2i64:
@@ -3106,9 +3106,9 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-F-NEXT:    vmv.v.x v12, a0
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v14, v8
+; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    vsrl.vi v14, v14, 23
 ; CHECK-F-NEXT:    vwsubu.vv v8, v12, v14
-; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv4i64:
@@ -3116,11 +3116,11 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-D-NEXT:    fsrmi a0, 1
 ; CHECK-D-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    li a1, 52
-; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 1086
-; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 1086
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv4i64:
@@ -3240,9 +3240,9 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-F-NEXT:    vmv.v.x v16, a0
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v20, v8
+; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    vsrl.vi v20, v20, 23
 ; CHECK-F-NEXT:    vwsubu.vv v8, v16, v20
-; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv8i64:
@@ -3250,11 +3250,11 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-D-NEXT:    fsrmi a0, 1
 ; CHECK-D-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    li a1, 52
-; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 1086
-; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 1086
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv8i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 6f515996677ee..39582ee3dacae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -940,12 +940,12 @@ define <vscale x 16 x i32> @vp_ctlz_nxv16i32(<vscale x 16 x i32> %va, <vscale x
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 158
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i32:
@@ -963,12 +963,12 @@ define <vscale x 16 x i32> @vp_ctlz_nxv16i32_unmasked(<vscale x 16 x i32> %va, i
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 158
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i32_unmasked:
@@ -988,13 +988,13 @@ define <vscale x 1 x i64> @vp_ctlz_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 64
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv1i64:
@@ -1012,13 +1012,13 @@ define <vscale x 1 x i64> @vp_ctlz_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 64
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv1i64_unmasked:
@@ -1038,13 +1038,13 @@ define <vscale x 2 x i64> @vp_ctlz_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 64
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv2i64:
@@ -1062,13 +1062,13 @@ define <vscale x 2 x i64> @vp_ctlz_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 64
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv2i64_unmasked:
@@ -1088,13 +1088,13 @@ define <vscale x 4 x i64> @vp_ctlz_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 64
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv4i64:
@@ -1112,13 +1112,13 @@ define <vscale x 4 x i64> @vp_ctlz_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 64
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv4i64_unmasked:
@@ -1138,13 +1138,13 @@ define <vscale x 7 x i64> @vp_ctlz_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 64
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv7i64:
@@ -1162,13 +1162,13 @@ define <vscale x 7 x i64> @vp_ctlz_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 64
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv7i64_unmasked:
@@ -1188,13 +1188,13 @@ define <vscale x 8 x i64> @vp_ctlz_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 64
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv8i64:
@@ -1212,13 +1212,13 @@ define <vscale x 8 x i64> @vp_ctlz_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 64
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv8i64_unmasked:
@@ -1258,14 +1258,14 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB46_2:
-; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vrsub.vx v8, v8, a3, v0.t
 ; CHECK-NEXT:    vminu.vx v8, v8, a4, v0.t
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i64:
@@ -1318,10 +1318,10 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vsrl.vx v8, v8, a2
 ; CHECK-NEXT:    vrsub.vx v8, v8, a3
 ; CHECK-NEXT:    vminu.vx v8, v8, a4
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i64_unmasked:
@@ -2201,10 +2201,10 @@ define <vscale x 16 x i32> @vp_ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va,
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 158
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i32:
@@ -2222,10 +2222,10 @@ define <vscale x 16 x i32> @vp_ctlz_zero_undef_nxv16i32_unmasked(<vscale x 16 x
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-NEXT:    li a0, 158
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i32_unmasked:
@@ -2244,11 +2244,11 @@ define <vscale x 1 x i64> @vp_ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va, <v
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i64:
@@ -2266,11 +2266,11 @@ define <vscale x 1 x i64> @vp_ctlz_zero_undef_nxv1i64_unmasked(<vscale x 1 x i64
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i64_unmasked:
@@ -2289,11 +2289,11 @@ define <vscale x 2 x i64> @vp_ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va, <v
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv2i64:
@@ -2311,11 +2311,11 @@ define <vscale x 2 x i64> @vp_ctlz_zero_undef_nxv2i64_unmasked(<vscale x 2 x i64
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv2i64_unmasked:
@@ -2334,11 +2334,11 @@ define <vscale x 4 x i64> @vp_ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va, <v
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv4i64:
@@ -2356,11 +2356,11 @@ define <vscale x 4 x i64> @vp_ctlz_zero_undef_nxv4i64_unmasked(<vscale x 4 x i64
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv4i64_unmasked:
@@ -2379,11 +2379,11 @@ define <vscale x 7 x i64> @vp_ctlz_zero_undef_nxv7i64(<vscale x 7 x i64> %va, <v
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv7i64:
@@ -2401,11 +2401,11 @@ define <vscale x 7 x i64> @vp_ctlz_zero_undef_nxv7i64_unmasked(<vscale x 7 x i64
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv7i64_unmasked:
@@ -2424,11 +2424,11 @@ define <vscale x 8 x i64> @vp_ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va, <v
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv8i64:
@@ -2446,11 +2446,11 @@ define <vscale x 8 x i64> @vp_ctlz_zero_undef_nxv8i64_unmasked(<vscale x 8 x i64
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 1086
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv8i64_unmasked:
@@ -2486,13 +2486,13 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB94_2:
-; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vrsub.vx v8, v8, a3, v0.t
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i64:
@@ -2543,9 +2543,9 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vsrl.vx v8, v8, a2
 ; CHECK-NEXT:    vrsub.vx v8, v8, a3
-; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i64_unmasked:
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
index 5761ae0926eae..262b26613df7b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
@@ -42,9 +42,9 @@ define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-F-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vsub.vx v8, v9, a0
-; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    vmerge.vim v8, v9, 8, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv1i8:
@@ -59,9 +59,9 @@ define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-D-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vsub.vx v8, v9, a0
-; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-D-NEXT:    vmerge.vim v8, v9, 8, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv1i8:
@@ -108,9 +108,9 @@ define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-F-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vsub.vx v8, v9, a0
-; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    vmerge.vim v8, v9, 8, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv2i8:
@@ -125,9 +125,9 @@ define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-D-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vsub.vx v8, v9, a0
-; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-D-NEXT:    vmerge.vim v8, v9, 8, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv2i8:
@@ -174,9 +174,9 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-F-NEXT:    vnsrl.wi v9, v12, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-F-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vsub.vx v8, v9, a0
-; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    vmerge.vim v8, v9, 8, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv4i8:
@@ -191,9 +191,9 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-D-NEXT:    vnsrl.wi v9, v12, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-D-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vsub.vx v8, v9, a0
-; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-D-NEXT:    vmerge.vim v8, v9, 8, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv4i8:
@@ -240,9 +240,9 @@ define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-F-NEXT:    vnsrl.wi v10, v12, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v10, 0
+; CHECK-F-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vsub.vx v8, v9, a0
-; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    vmerge.vim v8, v9, 8, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv8i8:
@@ -257,9 +257,9 @@ define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-D-NEXT:    vnsrl.wi v10, v12, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v10, 0
+; CHECK-D-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vsub.vx v8, v9, a0
-; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-D-NEXT:    vmerge.vim v8, v9, 8, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv8i8:
@@ -306,9 +306,9 @@ define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-F-NEXT:    vnsrl.wi v12, v16, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v10, v12, 0
+; CHECK-F-NEXT:    vsub.vx v10, v10, a0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vsub.vx v8, v10, a0
-; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    vmerge.vim v8, v10, 8, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv16i8:
@@ -323,9 +323,9 @@ define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-D-NEXT:    vnsrl.wi v12, v16, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v10, v12, 0
+; CHECK-D-NEXT:    vsub.vx v10, v10, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vsub.vx v8, v10, a0
-; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-D-NEXT:    vmerge.vim v8, v10, 8, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv16i8:
@@ -811,15 +811,15 @@ define <vscale x 1 x i32> @cttz_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    li a1, 127
 ; CHECK-F-NEXT:    vand.vv v9, v8, v9
-; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v9
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vsub.vx v8, v8, a1
-; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-F-NEXT:    vfcvt.f.xu.v v9, v9
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsrl.vi v8, v9, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    li a0, 32
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv1i32:
@@ -882,15 +882,15 @@ define <vscale x 2 x i32> @cttz_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    li a1, 127
 ; CHECK-F-NEXT:    vand.vv v9, v8, v9
-; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v9
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vsub.vx v8, v8, a1
-; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-F-NEXT:    vfcvt.f.xu.v v9, v9
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsrl.vi v8, v9, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    li a0, 32
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv2i32:
@@ -953,15 +953,15 @@ define <vscale x 4 x i32> @cttz_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    li a1, 127
 ; CHECK-F-NEXT:    vand.vv v10, v8, v10
-; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v10
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vsub.vx v8, v8, a1
-; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-F-NEXT:    vfcvt.f.xu.v v10, v10
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsrl.vi v8, v10, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    li a0, 32
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv4i32:
@@ -1024,15 +1024,15 @@ define <vscale x 8 x i32> @cttz_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    li a1, 127
 ; CHECK-F-NEXT:    vand.vv v12, v8, v12
-; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v12
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vsub.vx v8, v8, a1
-; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-F-NEXT:    vfcvt.f.xu.v v12, v12
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsrl.vi v8, v12, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    li a0, 32
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv8i32:
@@ -1095,15 +1095,15 @@ define <vscale x 16 x i32> @cttz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v16, v8, 0
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    li a1, 127
 ; CHECK-F-NEXT:    vand.vv v16, v8, v16
-; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v16
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vsub.vx v8, v8, a1
-; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-F-NEXT:    vfcvt.f.xu.v v16, v16
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsrl.vi v8, v16, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    li a0, 32
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv16i32:
@@ -1111,15 +1111,15 @@ define <vscale x 16 x i32> @cttz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v16, v8, 0
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    li a1, 127
 ; CHECK-D-NEXT:    vand.vv v16, v8, v16
-; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v16
-; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-D-NEXT:    vsub.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 32
-; CHECK-D-NEXT:    vmerge.vxm v8, v8, a1, v0
+; CHECK-D-NEXT:    vfcvt.f.xu.v v16, v16
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 127
+; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsrl.vi v8, v16, 23
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 32
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv16i32:
@@ -1218,17 +1218,19 @@ define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    li a1, 127
 ; CHECK-F-NEXT:    vand.vv v9, v8, v9
+; CHECK-F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v9
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-F-NEXT:    vfncvt.f.xu.w v8, v9
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vwsubu.vx v9, v8, a1
-; CHECK-F-NEXT:    li a1, 64
+; CHECK-F-NEXT:    vsrl.vi v8, v10, 23
+; CHECK-F-NEXT:    vwsubu.vx v9, v8, a0
+; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-F-NEXT:    vmerge.vxm v8, v9, a1, v0
-; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vmerge.vxm v8, v9, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv1i64:
@@ -1236,16 +1238,16 @@ define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    li a1, 52
 ; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v9, v9
-; CHECK-D-NEXT:    vsrl.vx v9, v9, a1
-; CHECK-D-NEXT:    li a1, 1023
-; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vsub.vx v8, v9, a1
-; CHECK-D-NEXT:    li a1, 64
-; CHECK-D-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 52
+; CHECK-D-NEXT:    vsrl.vx v9, v9, a0
+; CHECK-D-NEXT:    li a0, 1023
+; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v9, a0
+; CHECK-D-NEXT:    li a0, 64
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv1i64:
@@ -1344,17 +1346,19 @@ define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    li a1, 127
 ; CHECK-F-NEXT:    vand.vv v10, v8, v10
+; CHECK-F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v12, v10
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-F-NEXT:    vfncvt.f.xu.w v8, v10
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vwsubu.vx v10, v8, a1
-; CHECK-F-NEXT:    li a1, 64
+; CHECK-F-NEXT:    vsrl.vi v8, v12, 23
+; CHECK-F-NEXT:    vwsubu.vx v10, v8, a0
+; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-F-NEXT:    vmerge.vxm v8, v10, a1, v0
-; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vmerge.vxm v8, v10, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv2i64:
@@ -1362,16 +1366,16 @@ define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    li a1, 52
 ; CHECK-D-NEXT:    vand.vv v10, v8, v10
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v10, v10
-; CHECK-D-NEXT:    vsrl.vx v10, v10, a1
-; CHECK-D-NEXT:    li a1, 1023
-; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vsub.vx v8, v10, a1
-; CHECK-D-NEXT:    li a1, 64
-; CHECK-D-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 52
+; CHECK-D-NEXT:    vsrl.vx v10, v10, a0
+; CHECK-D-NEXT:    li a0, 1023
+; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v10, a0
+; CHECK-D-NEXT:    li a0, 64
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv2i64:
@@ -1470,17 +1474,19 @@ define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    li a1, 127
 ; CHECK-F-NEXT:    vand.vv v12, v8, v12
+; CHECK-F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v16, v12
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-F-NEXT:    vfncvt.f.xu.w v8, v12
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vwsubu.vx v12, v8, a1
-; CHECK-F-NEXT:    li a1, 64
+; CHECK-F-NEXT:    vsrl.vi v8, v16, 23
+; CHECK-F-NEXT:    vwsubu.vx v12, v8, a0
+; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-F-NEXT:    vmerge.vxm v8, v12, a1, v0
-; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vmerge.vxm v8, v12, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv4i64:
@@ -1488,16 +1494,16 @@ define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v12, v8, 0
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    li a1, 52
 ; CHECK-D-NEXT:    vand.vv v12, v8, v12
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v12, v12
-; CHECK-D-NEXT:    vsrl.vx v12, v12, a1
-; CHECK-D-NEXT:    li a1, 1023
-; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vsub.vx v8, v12, a1
-; CHECK-D-NEXT:    li a1, 64
-; CHECK-D-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 52
+; CHECK-D-NEXT:    vsrl.vx v12, v12, a0
+; CHECK-D-NEXT:    li a0, 1023
+; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v12, a0
+; CHECK-D-NEXT:    li a0, 64
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv4i64:
@@ -1596,17 +1602,19 @@ define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v16, v8, 0
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    li a1, 127
 ; CHECK-F-NEXT:    vand.vv v16, v8, v16
+; CHECK-F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v24, v16
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-F-NEXT:    vfncvt.f.xu.w v8, v16
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    vwsubu.vx v16, v8, a1
-; CHECK-F-NEXT:    li a1, 64
+; CHECK-F-NEXT:    vsrl.vi v8, v24, 23
+; CHECK-F-NEXT:    vwsubu.vx v16, v8, a0
+; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-F-NEXT:    vmerge.vxm v8, v16, a1, v0
-; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vmerge.vxm v8, v16, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv8i64:
@@ -1614,16 +1622,16 @@ define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v16, v8, 0
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    li a1, 52
 ; CHECK-D-NEXT:    vand.vv v16, v8, v16
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v16, v16
-; CHECK-D-NEXT:    vsrl.vx v16, v16, a1
-; CHECK-D-NEXT:    li a1, 1023
-; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-D-NEXT:    vsub.vx v8, v16, a1
-; CHECK-D-NEXT:    li a1, 64
-; CHECK-D-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 52
+; CHECK-D-NEXT:    vsrl.vx v16, v16, a0
+; CHECK-D-NEXT:    li a0, 1023
+; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v16, a0
+; CHECK-D-NEXT:    li a0, 64
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv8i64:
@@ -2378,10 +2386,10 @@ define <vscale x 1 x i32> @cttz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv1i32:
@@ -2442,10 +2450,10 @@ define <vscale x 2 x i32> @cttz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv2i32:
@@ -2506,10 +2514,10 @@ define <vscale x 4 x i32> @cttz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vand.vv v8, v8, v10
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv4i32:
@@ -2570,10 +2578,10 @@ define <vscale x 8 x i32> @cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vand.vv v8, v8, v12
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv8i32:
@@ -2634,10 +2642,10 @@ define <vscale x 16 x i32> @cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vand.vv v8, v8, v16
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv16i32:
@@ -2647,10 +2655,10 @@ define <vscale x 16 x i32> @cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-D-NEXT:    fsrmi a0, 1
 ; CHECK-D-NEXT:    vand.vv v8, v8, v16
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-D-NEXT:    li a1, 127
-; CHECK-D-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-D-NEXT:    li a0, 127
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv16i32:
@@ -2751,10 +2759,10 @@ define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v9, v8
-; CHECK-F-NEXT:    vsrl.vi v9, v9, 23
-; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vwsubu.vx v8, v9, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v9, v9, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vwsubu.vx v8, v9, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv1i64:
@@ -2762,13 +2770,13 @@ define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    li a1, 52
 ; CHECK-D-NEXT:    vand.vv v8, v8, v9
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 1023
-; CHECK-D-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 1023
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv1i64:
@@ -2869,10 +2877,10 @@ define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-F-NEXT:    vand.vv v8, v8, v10
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v8
-; CHECK-F-NEXT:    vsrl.vi v10, v10, 23
-; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vwsubu.vx v8, v10, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v10, v10, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vwsubu.vx v8, v10, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv2i64:
@@ -2880,13 +2888,13 @@ define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    li a1, 52
 ; CHECK-D-NEXT:    vand.vv v8, v8, v10
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 1023
-; CHECK-D-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 1023
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv2i64:
@@ -2987,10 +2995,10 @@ define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-F-NEXT:    vand.vv v8, v8, v12
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v12, v8
-; CHECK-F-NEXT:    vsrl.vi v12, v12, 23
-; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vwsubu.vx v8, v12, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v12, v12, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vwsubu.vx v8, v12, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv4i64:
@@ -2998,13 +3006,13 @@ define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v12, v8, 0
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    li a1, 52
 ; CHECK-D-NEXT:    vand.vv v8, v8, v12
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 1023
-; CHECK-D-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 1023
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv4i64:
@@ -3105,10 +3113,10 @@ define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-F-NEXT:    vand.vv v8, v8, v16
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v16, v8
-; CHECK-F-NEXT:    vsrl.vi v16, v16, 23
-; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vwsubu.vx v8, v16, a1
 ; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vsrl.vi v16, v16, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vwsubu.vx v8, v16, a0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv8i64:
@@ -3116,13 +3124,13 @@ define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v16, v8, 0
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    li a1, 52
 ; CHECK-D-NEXT:    vand.vv v8, v8, v16
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-D-NEXT:    li a1, 1023
-; CHECK-D-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    li a0, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-D-NEXT:    li a0, 1023
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv8i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
index 766717d92a749..60ea1881ed213 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
@@ -3708,12 +3708,12 @@ define <vscale x 16 x i32> @vp_cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va,
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    li a1, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
+; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i32:
@@ -3733,10 +3733,10 @@ define <vscale x 16 x i32> @vp_cttz_zero_undef_nxv16i32_unmasked(<vscale x 16 x
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    vsrl.vi v8, v8, 23
-; CHECK-NEXT:    li a1, 127
-; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i32_unmasked:
@@ -3755,13 +3755,13 @@ define <vscale x 1 x i64> @vp_cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va, <v
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    li a1, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv1i64:
@@ -3779,13 +3779,13 @@ define <vscale x 1 x i64> @vp_cttz_zero_undef_nxv1i64_unmasked(<vscale x 1 x i64
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    li a1, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv1i64_unmasked:
@@ -3804,13 +3804,13 @@ define <vscale x 2 x i64> @vp_cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va, <v
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    li a1, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv2i64:
@@ -3828,13 +3828,13 @@ define <vscale x 2 x i64> @vp_cttz_zero_undef_nxv2i64_unmasked(<vscale x 2 x i64
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    li a1, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v10
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv2i64_unmasked:
@@ -3853,13 +3853,13 @@ define <vscale x 4 x i64> @vp_cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va, <v
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    li a1, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv4i64:
@@ -3877,13 +3877,13 @@ define <vscale x 4 x i64> @vp_cttz_zero_undef_nxv4i64_unmasked(<vscale x 4 x i64
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    li a1, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v12
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv4i64_unmasked:
@@ -3902,13 +3902,13 @@ define <vscale x 7 x i64> @vp_cttz_zero_undef_nxv7i64(<vscale x 7 x i64> %va, <v
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    li a1, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv7i64:
@@ -3926,13 +3926,13 @@ define <vscale x 7 x i64> @vp_cttz_zero_undef_nxv7i64_unmasked(<vscale x 7 x i64
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    li a1, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv7i64_unmasked:
@@ -3951,13 +3951,13 @@ define <vscale x 8 x i64> @vp_cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va, <v
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    li a1, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    li a1, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv8i64:
@@ -3975,13 +3975,13 @@ define <vscale x 8 x i64> @vp_cttz_zero_undef_nxv8i64_unmasked(<vscale x 8 x i64
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    li a1, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    vsrl.vx v8, v8, a1
-; CHECK-NEXT:    li a1, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    li a0, 52
+; CHECK-NEXT:    vsrl.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 1023
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv8i64_unmasked:
@@ -4042,9 +4042,9 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsub.vx v8, v8, a3, v0.t
-; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -4104,12 +4104,12 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 ; CHECK-NEXT:  .LBB95_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v24, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v24
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vand.vv v8, v8, v24
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
+; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vsrl.vx v8, v8, a2
 ; CHECK-NEXT:    vsub.vx v8, v8, a3
-; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i64_unmasked:
diff --git a/llvm/test/CodeGen/RISCV/rvv/dont-sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/dont-sink-splat-operands.ll
index 31fa5d025156f..05defa160c971 100644
--- a/llvm/test/CodeGen/RISCV/rvv/dont-sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/dont-sink-splat-operands.ll
@@ -406,33 +406,33 @@ for.cond.cleanup:                                 ; preds = %vector.body
 define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
 ; NO-SINK-LABEL: sink_splat_fadd_scalable:
 ; NO-SINK:       # %bb.0: # %entry
-; NO-SINK-NEXT:    csrr a1, vlenb
-; NO-SINK-NEXT:    srli a3, a1, 2
-; NO-SINK-NEXT:    li a2, 1024
-; NO-SINK-NEXT:    bgeu a2, a3, .LBB4_2
+; NO-SINK-NEXT:    csrr a2, vlenb
+; NO-SINK-NEXT:    srli a3, a2, 2
+; NO-SINK-NEXT:    li a1, 1024
+; NO-SINK-NEXT:    bgeu a1, a3, .LBB4_2
 ; NO-SINK-NEXT:  # %bb.1:
-; NO-SINK-NEXT:    li a2, 0
+; NO-SINK-NEXT:    li a1, 0
 ; NO-SINK-NEXT:    j .LBB4_5
 ; NO-SINK-NEXT:  .LBB4_2: # %vector.ph
-; NO-SINK-NEXT:    addi a2, a3, -1
-; NO-SINK-NEXT:    andi a4, a2, 1024
-; NO-SINK-NEXT:    xori a2, a4, 1024
+; NO-SINK-NEXT:    addi a1, a3, -1
+; NO-SINK-NEXT:    andi a4, a1, 1024
+; NO-SINK-NEXT:    xori a1, a4, 1024
 ; NO-SINK-NEXT:    vsetvli a5, zero, e32, m1, ta, ma
 ; NO-SINK-NEXT:    vfmv.v.f v8, fa0
 ; NO-SINK-NEXT:    mv a5, a0
-; NO-SINK-NEXT:    mv a6, a2
+; NO-SINK-NEXT:    mv a6, a1
 ; NO-SINK-NEXT:  .LBB4_3: # %vector.body
 ; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; NO-SINK-NEXT:    vl1re32.v v9, (a5)
 ; NO-SINK-NEXT:    sub a6, a6, a3
 ; NO-SINK-NEXT:    vfadd.vv v9, v9, v8
 ; NO-SINK-NEXT:    vs1r.v v9, (a5)
-; NO-SINK-NEXT:    add a5, a5, a1
+; NO-SINK-NEXT:    add a5, a5, a2
 ; NO-SINK-NEXT:    bnez a6, .LBB4_3
 ; NO-SINK-NEXT:  # %bb.4: # %middle.block
 ; NO-SINK-NEXT:    beqz a4, .LBB4_7
 ; NO-SINK-NEXT:  .LBB4_5: # %for.body.preheader
-; NO-SINK-NEXT:    slli a1, a2, 2
+; NO-SINK-NEXT:    slli a1, a1, 2
 ; NO-SINK-NEXT:    lui a2, 1
 ; NO-SINK-NEXT:    add a1, a0, a1
 ; NO-SINK-NEXT:    add a0, a0, a2
@@ -448,19 +448,19 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
 ;
 ; SINK-LABEL: sink_splat_fadd_scalable:
 ; SINK:       # %bb.0: # %entry
-; SINK-NEXT:    csrr a1, vlenb
-; SINK-NEXT:    srli a3, a1, 2
-; SINK-NEXT:    li a2, 1024
-; SINK-NEXT:    bgeu a2, a3, .LBB4_2
+; SINK-NEXT:    csrr a2, vlenb
+; SINK-NEXT:    srli a3, a2, 2
+; SINK-NEXT:    li a1, 1024
+; SINK-NEXT:    bgeu a1, a3, .LBB4_2
 ; SINK-NEXT:  # %bb.1:
-; SINK-NEXT:    li a2, 0
+; SINK-NEXT:    li a1, 0
 ; SINK-NEXT:    j .LBB4_5
 ; SINK-NEXT:  .LBB4_2: # %vector.ph
-; SINK-NEXT:    addi a2, a3, -1
-; SINK-NEXT:    andi a4, a2, 1024
-; SINK-NEXT:    xori a2, a4, 1024
+; SINK-NEXT:    addi a1, a3, -1
+; SINK-NEXT:    andi a4, a1, 1024
+; SINK-NEXT:    xori a1, a4, 1024
 ; SINK-NEXT:    mv a5, a0
-; SINK-NEXT:    mv a6, a2
+; SINK-NEXT:    mv a6, a1
 ; SINK-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
 ; SINK-NEXT:  .LBB4_3: # %vector.body
 ; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -468,12 +468,12 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
 ; SINK-NEXT:    sub a6, a6, a3
 ; SINK-NEXT:    vfadd.vf v8, v8, fa0
 ; SINK-NEXT:    vs1r.v v8, (a5)
-; SINK-NEXT:    add a5, a5, a1
+; SINK-NEXT:    add a5, a5, a2
 ; SINK-NEXT:    bnez a6, .LBB4_3
 ; SINK-NEXT:  # %bb.4: # %middle.block
 ; SINK-NEXT:    beqz a4, .LBB4_7
 ; SINK-NEXT:  .LBB4_5: # %for.body.preheader
-; SINK-NEXT:    slli a1, a2, 2
+; SINK-NEXT:    slli a1, a1, 2
 ; SINK-NEXT:    lui a2, 1
 ; SINK-NEXT:    add a1, a0, a1
 ; SINK-NEXT:    add a0, a0, a2
@@ -489,19 +489,19 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
 ;
 ; DEFAULT-LABEL: sink_splat_fadd_scalable:
 ; DEFAULT:       # %bb.0: # %entry
-; DEFAULT-NEXT:    csrr a1, vlenb
-; DEFAULT-NEXT:    srli a3, a1, 2
-; DEFAULT-NEXT:    li a2, 1024
-; DEFAULT-NEXT:    bgeu a2, a3, .LBB4_2
+; DEFAULT-NEXT:    csrr a2, vlenb
+; DEFAULT-NEXT:    srli a3, a2, 2
+; DEFAULT-NEXT:    li a1, 1024
+; DEFAULT-NEXT:    bgeu a1, a3, .LBB4_2
 ; DEFAULT-NEXT:  # %bb.1:
-; DEFAULT-NEXT:    li a2, 0
+; DEFAULT-NEXT:    li a1, 0
 ; DEFAULT-NEXT:    j .LBB4_5
 ; DEFAULT-NEXT:  .LBB4_2: # %vector.ph
-; DEFAULT-NEXT:    addi a2, a3, -1
-; DEFAULT-NEXT:    andi a4, a2, 1024
-; DEFAULT-NEXT:    xori a2, a4, 1024
+; DEFAULT-NEXT:    addi a1, a3, -1
+; DEFAULT-NEXT:    andi a4, a1, 1024
+; DEFAULT-NEXT:    xori a1, a4, 1024
 ; DEFAULT-NEXT:    mv a5, a0
-; DEFAULT-NEXT:    mv a6, a2
+; DEFAULT-NEXT:    mv a6, a1
 ; DEFAULT-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
 ; DEFAULT-NEXT:  .LBB4_3: # %vector.body
 ; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -509,12 +509,12 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
 ; DEFAULT-NEXT:    sub a6, a6, a3
 ; DEFAULT-NEXT:    vfadd.vf v8, v8, fa0
 ; DEFAULT-NEXT:    vs1r.v v8, (a5)
-; DEFAULT-NEXT:    add a5, a5, a1
+; DEFAULT-NEXT:    add a5, a5, a2
 ; DEFAULT-NEXT:    bnez a6, .LBB4_3
 ; DEFAULT-NEXT:  # %bb.4: # %middle.block
 ; DEFAULT-NEXT:    beqz a4, .LBB4_7
 ; DEFAULT-NEXT:  .LBB4_5: # %for.body.preheader
-; DEFAULT-NEXT:    slli a1, a2, 2
+; DEFAULT-NEXT:    slli a1, a1, 2
 ; DEFAULT-NEXT:    lui a2, 1
 ; DEFAULT-NEXT:    add a1, a0, a1
 ; DEFAULT-NEXT:    add a0, a0, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll
index 8c63c2d4be8c1..ec8580e0b6f12 100644
--- a/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll
@@ -497,12 +497,12 @@ declare <vscale x 1 x double> @llvm.ceil.nxv1f64(<vscale x 1 x double>)
 define <vscale x 1 x i8> @ceil_nxv1f64_to_si8(<vscale x 1 x double> %x) {
 ; RV32-LABEL: ceil_nxv1f64_to_si8:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a0, %hi(.LCPI16_0)
-; RV32-NEXT:    fld fa5, %lo(.LCPI16_0)(a0)
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vfabs.v v9, v8
-; RV32-NEXT:    vmflt.vf v0, v9, fa5
+; RV32-NEXT:    lui a0, %hi(.LCPI16_0)
+; RV32-NEXT:    fld fa5, %lo(.LCPI16_0)(a0)
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vmflt.vf v0, v9, fa5
 ; RV32-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -518,12 +518,12 @@ define <vscale x 1 x i8> @ceil_nxv1f64_to_si8(<vscale x 1 x double> %x) {
 ;
 ; RV64-LABEL: ceil_nxv1f64_to_si8:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a0, %hi(.LCPI16_0)
-; RV64-NEXT:    fld fa5, %lo(.LCPI16_0)(a0)
 ; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV64-NEXT:    vfabs.v v9, v8
-; RV64-NEXT:    vmflt.vf v0, v9, fa5
+; RV64-NEXT:    lui a0, %hi(.LCPI16_0)
+; RV64-NEXT:    fld fa5, %lo(.LCPI16_0)(a0)
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vmflt.vf v0, v9, fa5
 ; RV64-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -544,12 +544,12 @@ define <vscale x 1 x i8> @ceil_nxv1f64_to_si8(<vscale x 1 x double> %x) {
 define <vscale x 1 x i8> @ceil_nxv1f64_to_ui8(<vscale x 1 x double> %x) {
 ; RV32-LABEL: ceil_nxv1f64_to_ui8:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a0, %hi(.LCPI17_0)
-; RV32-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vfabs.v v9, v8
-; RV32-NEXT:    vmflt.vf v0, v9, fa5
+; RV32-NEXT:    lui a0, %hi(.LCPI17_0)
+; RV32-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vmflt.vf v0, v9, fa5
 ; RV32-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -565,12 +565,12 @@ define <vscale x 1 x i8> @ceil_nxv1f64_to_ui8(<vscale x 1 x double> %x) {
 ;
 ; RV64-LABEL: ceil_nxv1f64_to_ui8:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a0, %hi(.LCPI17_0)
-; RV64-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV64-NEXT:    vfabs.v v9, v8
-; RV64-NEXT:    vmflt.vf v0, v9, fa5
+; RV64-NEXT:    lui a0, %hi(.LCPI17_0)
+; RV64-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vmflt.vf v0, v9, fa5
 ; RV64-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -591,12 +591,12 @@ define <vscale x 1 x i8> @ceil_nxv1f64_to_ui8(<vscale x 1 x double> %x) {
 define <vscale x 1 x i16> @ceil_nxv1f64_to_si16(<vscale x 1 x double> %x) {
 ; RV32-LABEL: ceil_nxv1f64_to_si16:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a0, %hi(.LCPI18_0)
-; RV32-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vfabs.v v9, v8
-; RV32-NEXT:    vmflt.vf v0, v9, fa5
+; RV32-NEXT:    lui a0, %hi(.LCPI18_0)
+; RV32-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vmflt.vf v0, v9, fa5
 ; RV32-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -610,12 +610,12 @@ define <vscale x 1 x i16> @ceil_nxv1f64_to_si16(<vscale x 1 x double> %x) {
 ;
 ; RV64-LABEL: ceil_nxv1f64_to_si16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a0, %hi(.LCPI18_0)
-; RV64-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
 ; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV64-NEXT:    vfabs.v v9, v8
-; RV64-NEXT:    vmflt.vf v0, v9, fa5
+; RV64-NEXT:    lui a0, %hi(.LCPI18_0)
+; RV64-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vmflt.vf v0, v9, fa5
 ; RV64-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -634,12 +634,12 @@ define <vscale x 1 x i16> @ceil_nxv1f64_to_si16(<vscale x 1 x double> %x) {
 define <vscale x 1 x i16> @ceil_nxv1f64_to_ui16(<vscale x 1 x double> %x) {
 ; RV32-LABEL: ceil_nxv1f64_to_ui16:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a0, %hi(.LCPI19_0)
-; RV32-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vfabs.v v9, v8
-; RV32-NEXT:    vmflt.vf v0, v9, fa5
+; RV32-NEXT:    lui a0, %hi(.LCPI19_0)
+; RV32-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vmflt.vf v0, v9, fa5
 ; RV32-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -653,12 +653,12 @@ define <vscale x 1 x i16> @ceil_nxv1f64_to_ui16(<vscale x 1 x double> %x) {
 ;
 ; RV64-LABEL: ceil_nxv1f64_to_ui16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a0, %hi(.LCPI19_0)
-; RV64-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV64-NEXT:    vfabs.v v9, v8
-; RV64-NEXT:    vmflt.vf v0, v9, fa5
+; RV64-NEXT:    lui a0, %hi(.LCPI19_0)
+; RV64-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vmflt.vf v0, v9, fa5
 ; RV64-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -771,12 +771,12 @@ declare <vscale x 4 x double> @llvm.ceil.nxv4f64(<vscale x 4 x double>)
 define <vscale x 4 x i8> @ceil_nxv4f64_to_si8(<vscale x 4 x double> %x) {
 ; RV32-LABEL: ceil_nxv4f64_to_si8:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a0, %hi(.LCPI24_0)
-; RV32-NEXT:    fld fa5, %lo(.LCPI24_0)(a0)
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vfabs.v v12, v8
-; RV32-NEXT:    vmflt.vf v0, v12, fa5
+; RV32-NEXT:    lui a0, %hi(.LCPI24_0)
+; RV32-NEXT:    fld fa5, %lo(.LCPI24_0)(a0)
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vmflt.vf v0, v12, fa5
 ; RV32-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -792,12 +792,12 @@ define <vscale x 4 x i8> @ceil_nxv4f64_to_si8(<vscale x 4 x double> %x) {
 ;
 ; RV64-LABEL: ceil_nxv4f64_to_si8:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a0, %hi(.LCPI24_0)
-; RV64-NEXT:    fld fa5, %lo(.LCPI24_0)(a0)
 ; RV64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV64-NEXT:    vfabs.v v12, v8
-; RV64-NEXT:    vmflt.vf v0, v12, fa5
+; RV64-NEXT:    lui a0, %hi(.LCPI24_0)
+; RV64-NEXT:    fld fa5, %lo(.LCPI24_0)(a0)
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vmflt.vf v0, v12, fa5
 ; RV64-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -818,12 +818,12 @@ define <vscale x 4 x i8> @ceil_nxv4f64_to_si8(<vscale x 4 x double> %x) {
 define <vscale x 4 x i8> @ceil_nxv4f64_to_ui8(<vscale x 4 x double> %x) {
 ; RV32-LABEL: ceil_nxv4f64_to_ui8:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a0, %hi(.LCPI25_0)
-; RV32-NEXT:    fld fa5, %lo(.LCPI25_0)(a0)
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vfabs.v v12, v8
-; RV32-NEXT:    vmflt.vf v0, v12, fa5
+; RV32-NEXT:    lui a0, %hi(.LCPI25_0)
+; RV32-NEXT:    fld fa5, %lo(.LCPI25_0)(a0)
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vmflt.vf v0, v12, fa5
 ; RV32-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -839,12 +839,12 @@ define <vscale x 4 x i8> @ceil_nxv4f64_to_ui8(<vscale x 4 x double> %x) {
 ;
 ; RV64-LABEL: ceil_nxv4f64_to_ui8:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a0, %hi(.LCPI25_0)
-; RV64-NEXT:    fld fa5, %lo(.LCPI25_0)(a0)
 ; RV64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV64-NEXT:    vfabs.v v12, v8
-; RV64-NEXT:    vmflt.vf v0, v12, fa5
+; RV64-NEXT:    lui a0, %hi(.LCPI25_0)
+; RV64-NEXT:    fld fa5, %lo(.LCPI25_0)(a0)
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vmflt.vf v0, v12, fa5
 ; RV64-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -865,12 +865,12 @@ define <vscale x 4 x i8> @ceil_nxv4f64_to_ui8(<vscale x 4 x double> %x) {
 define <vscale x 4 x i16> @ceil_nxv4f64_to_si16(<vscale x 4 x double> %x) {
 ; RV32-LABEL: ceil_nxv4f64_to_si16:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a0, %hi(.LCPI26_0)
-; RV32-NEXT:    fld fa5, %lo(.LCPI26_0)(a0)
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vfabs.v v12, v8
-; RV32-NEXT:    vmflt.vf v0, v12, fa5
+; RV32-NEXT:    lui a0, %hi(.LCPI26_0)
+; RV32-NEXT:    fld fa5, %lo(.LCPI26_0)(a0)
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vmflt.vf v0, v12, fa5
 ; RV32-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -884,12 +884,12 @@ define <vscale x 4 x i16> @ceil_nxv4f64_to_si16(<vscale x 4 x double> %x) {
 ;
 ; RV64-LABEL: ceil_nxv4f64_to_si16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a0, %hi(.LCPI26_0)
-; RV64-NEXT:    fld fa5, %lo(.LCPI26_0)(a0)
 ; RV64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV64-NEXT:    vfabs.v v12, v8
-; RV64-NEXT:    vmflt.vf v0, v12, fa5
+; RV64-NEXT:    lui a0, %hi(.LCPI26_0)
+; RV64-NEXT:    fld fa5, %lo(.LCPI26_0)(a0)
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vmflt.vf v0, v12, fa5
 ; RV64-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -908,12 +908,12 @@ define <vscale x 4 x i16> @ceil_nxv4f64_to_si16(<vscale x 4 x double> %x) {
 define <vscale x 4 x i16> @ceil_nxv4f64_to_ui16(<vscale x 4 x double> %x) {
 ; RV32-LABEL: ceil_nxv4f64_to_ui16:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a0, %hi(.LCPI27_0)
-; RV32-NEXT:    fld fa5, %lo(.LCPI27_0)(a0)
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vfabs.v v12, v8
-; RV32-NEXT:    vmflt.vf v0, v12, fa5
+; RV32-NEXT:    lui a0, %hi(.LCPI27_0)
+; RV32-NEXT:    fld fa5, %lo(.LCPI27_0)(a0)
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vmflt.vf v0, v12, fa5
 ; RV32-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -927,12 +927,12 @@ define <vscale x 4 x i16> @ceil_nxv4f64_to_ui16(<vscale x 4 x double> %x) {
 ;
 ; RV64-LABEL: ceil_nxv4f64_to_ui16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a0, %hi(.LCPI27_0)
-; RV64-NEXT:    fld fa5, %lo(.LCPI27_0)(a0)
 ; RV64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV64-NEXT:    vfabs.v v12, v8
-; RV64-NEXT:    vmflt.vf v0, v12, fa5
+; RV64-NEXT:    lui a0, %hi(.LCPI27_0)
+; RV64-NEXT:    fld fa5, %lo(.LCPI27_0)(a0)
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vmflt.vf v0, v12, fa5
 ; RV64-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vfcvt.f.x.v v12, v12, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/expandload.ll b/llvm/test/CodeGen/RISCV/rvv/expandload.ll
index a35cf14203f78..51c70a32ccac8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/expandload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/expandload.ll
@@ -136,12 +136,12 @@ define <32 x i8> @test_expandload_v32i8(ptr %base, <32 x i1> %mask, <32 x i8> %p
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    viota.m v10, v0
 ; CHECK-NEXT:    vcpop.m a2, v0
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
-; CHECK-NEXT:    vle8.v v10, (a0)
+; CHECK-NEXT:    vle8.v v12, (a0)
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT:    viota.m v12, v0
-; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vrgather.vv v8, v12, v10, v0.t
 ; CHECK-NEXT:    ret
   %res = call <32 x i8> @llvm.masked.expandload.v32i8(ptr align 1 %base, <32 x i1> %mask, <32 x i8> %passthru)
   ret <32 x i8> %res
@@ -163,12 +163,12 @@ define <64 x i8> @test_expandload_v64i8(ptr %base, <64 x i1> %mask, <64 x i8> %p
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    viota.m v12, v0
 ; CHECK-NEXT:    vcpop.m a2, v0
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-NEXT:    vle8.v v12, (a0)
+; CHECK-NEXT:    vle8.v v16, (a0)
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT:    viota.m v16, v0
-; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
+; CHECK-NEXT:    vrgather.vv v8, v16, v12, v0.t
 ; CHECK-NEXT:    ret
   %res = call <64 x i8> @llvm.masked.expandload.v64i8(ptr align 1 %base, <64 x i1> %mask, <64 x i8> %passthru)
   ret <64 x i8> %res
@@ -190,12 +190,12 @@ define <128 x i8> @test_expandload_v128i8(ptr %base, <128 x i1> %mask, <128 x i8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    viota.m v16, v0
 ; CHECK-NEXT:    vcpop.m a2, v0
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vle8.v v16, (a0)
+; CHECK-NEXT:    vle8.v v24, (a0)
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT:    viota.m v24, v0
-; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    vrgather.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %res = call <128 x i8> @llvm.masked.expandload.v128i8(ptr align 1 %base, <128 x i1> %mask, <128 x i8> %passthru)
   ret <128 x i8> %res
@@ -218,106 +218,71 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV32-NEXT:    addi sp, sp, -16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-RV32-NEXT:    csrr a2, vlenb
-; CHECK-RV32-NEXT:    slli a2, a2, 5
+; CHECK-RV32-NEXT:    slli a2, a2, 4
 ; CHECK-RV32-NEXT:    sub sp, sp, a2
-; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-RV32-NEXT:    csrr a2, vlenb
-; CHECK-RV32-NEXT:    li a3, 24
-; CHECK-RV32-NEXT:    mul a2, a2, a3
+; CHECK-RV32-NEXT:    slli a2, a2, 3
 ; CHECK-RV32-NEXT:    add a2, sp, a2
 ; CHECK-RV32-NEXT:    addi a2, a2, 16
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vmv1r.v v7, v8
 ; CHECK-RV32-NEXT:    li a2, 128
-; CHECK-RV32-NEXT:    vslidedown.vi v9, v0, 1
+; CHECK-RV32-NEXT:    vslidedown.vi v6, v0, 1
 ; CHECK-RV32-NEXT:    li a3, 32
 ; CHECK-RV32-NEXT:    vmv.x.s a4, v0
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-RV32-NEXT:    viota.m v16, v0
+; CHECK-RV32-NEXT:    addi a5, sp, 16
+; CHECK-RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vcpop.m a5, v0
+; CHECK-RV32-NEXT:    vsetvli zero, a5, e8, m8, ta, ma
+; CHECK-RV32-NEXT:    vle8.v v24, (a0)
+; CHECK-RV32-NEXT:    csrr a5, vlenb
+; CHECK-RV32-NEXT:    slli a5, a5, 3
+; CHECK-RV32-NEXT:    add a5, sp, a5
+; CHECK-RV32-NEXT:    addi a5, a5, 16
+; CHECK-RV32-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; CHECK-RV32-NEXT:    addi a5, sp, 16
+; CHECK-RV32-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
+; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, mu
+; CHECK-RV32-NEXT:    vrgather.vv v16, v24, v8, v0.t
+; CHECK-RV32-NEXT:    csrr a5, vlenb
+; CHECK-RV32-NEXT:    slli a5, a5, 3
+; CHECK-RV32-NEXT:    add a5, sp, a5
+; CHECK-RV32-NEXT:    addi a5, a5, 16
+; CHECK-RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vle8.v v16, (a1)
-; CHECK-RV32-NEXT:    csrr a1, vlenb
-; CHECK-RV32-NEXT:    slli a1, a1, 3
-; CHECK-RV32-NEXT:    add a1, sp, a1
-; CHECK-RV32-NEXT:    addi a1, a1, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsrl.vx v10, v9, a3
+; CHECK-RV32-NEXT:    vsrl.vx v10, v6, a3
 ; CHECK-RV32-NEXT:    vsrl.vx v11, v0, a3
-; CHECK-RV32-NEXT:    vmv.x.s a1, v9
-; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-RV32-NEXT:    vcpop.m a3, v0
-; CHECK-RV32-NEXT:    cpop a4, a4
-; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a5, v10
-; CHECK-RV32-NEXT:    vmv.x.s a6, v11
-; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
-; CHECK-RV32-NEXT:    vle8.v v8, (a0)
-; CHECK-RV32-NEXT:    csrr a3, vlenb
-; CHECK-RV32-NEXT:    slli a3, a3, 4
-; CHECK-RV32-NEXT:    add a3, sp, a3
-; CHECK-RV32-NEXT:    addi a3, a3, 16
-; CHECK-RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vmv.x.s a1, v6
+; CHECK-RV32-NEXT:    cpop a3, a4
+; CHECK-RV32-NEXT:    vmv.x.s a4, v10
+; CHECK-RV32-NEXT:    vmv.x.s a5, v11
 ; CHECK-RV32-NEXT:    cpop a1, a1
-; CHECK-RV32-NEXT:    cpop a3, a6
 ; CHECK-RV32-NEXT:    cpop a5, a5
-; CHECK-RV32-NEXT:    add a3, a4, a3
-; CHECK-RV32-NEXT:    add a1, a1, a5
+; CHECK-RV32-NEXT:    cpop a4, a4
+; CHECK-RV32-NEXT:    add a3, a3, a5
+; CHECK-RV32-NEXT:    add a1, a1, a4
 ; CHECK-RV32-NEXT:    add a1, a3, a1
 ; CHECK-RV32-NEXT:    add a0, a0, a1
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-RV32-NEXT:    vcpop.m a1, v7
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-RV32-NEXT:    vle8.v v8, (a0)
-; CHECK-RV32-NEXT:    addi a0, sp, 16
-; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vle8.v v24, (a0)
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, mu
-; CHECK-RV32-NEXT:    viota.m v24, v0
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT:    viota.m v16, v7
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    viota.m v8, v7
 ; CHECK-RV32-NEXT:    vmv1r.v v0, v7
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 3
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    addi a0, sp, 16
-; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
+; CHECK-RV32-NEXT:    slli a0, a0, 3
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 5
+; CHECK-RV32-NEXT:    slli a0, a0, 4
 ; CHECK-RV32-NEXT:    add sp, sp, a0
 ; CHECK-RV32-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -329,38 +294,50 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV64-NEXT:    addi sp, sp, -16
 ; CHECK-RV64-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-RV64-NEXT:    csrr a2, vlenb
-; CHECK-RV64-NEXT:    slli a2, a2, 5
-; CHECK-RV64-NEXT:    sub sp, sp, a2
-; CHECK-RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-RV64-NEXT:    csrr a2, vlenb
 ; CHECK-RV64-NEXT:    li a3, 24
 ; CHECK-RV64-NEXT:    mul a2, a2, a3
+; CHECK-RV64-NEXT:    sub sp, sp, a2
+; CHECK-RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-RV64-NEXT:    csrr a2, vlenb
+; CHECK-RV64-NEXT:    slli a2, a2, 4
 ; CHECK-RV64-NEXT:    add a2, sp, a2
 ; CHECK-RV64-NEXT:    addi a2, a2, 16
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV64-NEXT:    vmv1r.v v7, v8
 ; CHECK-RV64-NEXT:    li a2, 128
-; CHECK-RV64-NEXT:    vslidedown.vi v9, v0, 1
+; CHECK-RV64-NEXT:    vslidedown.vi v6, v0, 1
 ; CHECK-RV64-NEXT:    vmv.x.s a3, v0
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-RV64-NEXT:    vle8.v v16, (a1)
-; CHECK-RV64-NEXT:    csrr a1, vlenb
-; CHECK-RV64-NEXT:    slli a1, a1, 3
-; CHECK-RV64-NEXT:    add a1, sp, a1
-; CHECK-RV64-NEXT:    addi a1, a1, 16
-; CHECK-RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.x.s a1, v9
-; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-RV64-NEXT:    viota.m v16, v0
+; CHECK-RV64-NEXT:    csrr a4, vlenb
+; CHECK-RV64-NEXT:    slli a4, a4, 3
+; CHECK-RV64-NEXT:    add a4, sp, a4
+; CHECK-RV64-NEXT:    addi a4, a4, 16
+; CHECK-RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vcpop.m a4, v0
 ; CHECK-RV64-NEXT:    vsetvli zero, a4, e8, m8, ta, ma
-; CHECK-RV64-NEXT:    vle8.v v8, (a0)
+; CHECK-RV64-NEXT:    vle8.v v16, (a0)
+; CHECK-RV64-NEXT:    csrr a4, vlenb
+; CHECK-RV64-NEXT:    slli a4, a4, 3
+; CHECK-RV64-NEXT:    add a4, sp, a4
+; CHECK-RV64-NEXT:    addi a4, a4, 16
+; CHECK-RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT:    csrr a4, vlenb
+; CHECK-RV64-NEXT:    slli a4, a4, 4
+; CHECK-RV64-NEXT:    add a4, sp, a4
+; CHECK-RV64-NEXT:    addi a4, a4, 16
+; CHECK-RV64-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, mu
+; CHECK-RV64-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-RV64-NEXT:    csrr a4, vlenb
 ; CHECK-RV64-NEXT:    slli a4, a4, 4
 ; CHECK-RV64-NEXT:    add a4, sp, a4
 ; CHECK-RV64-NEXT:    addi a4, a4, 16
 ; CHECK-RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vle8.v v16, (a1)
+; CHECK-RV64-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.x.s a1, v6
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-RV64-NEXT:    vcpop.m a4, v7
 ; CHECK-RV64-NEXT:    cpop a3, a3
@@ -372,53 +349,29 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, mu
-; CHECK-RV64-NEXT:    viota.m v24, v0
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT:    viota.m v24, v7
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 4
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT:    viota.m v16, v7
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 4
+; CHECK-RV64-NEXT:    slli a0, a0, 3
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    slli a0, a0, 3
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT:    addi a0, sp, 16
+; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT:    vrgather.vv v16, v8, v24, v0.t
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    slli a0, a0, 4
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    li a1, 24
 ; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 5
 ; CHECK-RV64-NEXT:    add sp, sp, a0
 ; CHECK-RV64-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -608,13 +561,13 @@ define <32 x i16> @test_expandload_v32i16(ptr %base, <32 x i1> %mask, <32 x i16>
 ; CHECK-LABEL: test_expandload_v32i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    viota.m v12, v0
 ; CHECK-NEXT:    vcpop.m a2, v0
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vle16.v v16, (a0)
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT:    viota.m v16, v0
-; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
+; CHECK-NEXT:    vrgather.vv v8, v16, v12, v0.t
 ; CHECK-NEXT:    ret
   %res = call <32 x i16> @llvm.masked.expandload.v32i16(ptr align 2 %base, <32 x i1> %mask, <32 x i16> %passthru)
   ret <32 x i16> %res
@@ -635,13 +588,13 @@ define <64 x i16> @test_expandload_v64i16(ptr %base, <64 x i1> %mask, <64 x i16>
 ; CHECK-LABEL: test_expandload_v64i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    viota.m v16, v0
 ; CHECK-NEXT:    vcpop.m a2, v0
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-NEXT:    vle16.v v16, (a0)
+; CHECK-NEXT:    vle16.v v24, (a0)
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    viota.m v24, v0
-; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    vrgather.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %res = call <64 x i16> @llvm.masked.expandload.v64i16(ptr align 2 %base, <64 x i1> %mask, <64 x i16> %passthru)
   ret <64 x i16> %res
@@ -664,76 +617,66 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
 ; CHECK-RV32-NEXT:    addi sp, sp, -16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
-; CHECK-RV32-NEXT:    slli a1, a1, 5
-; CHECK-RV32-NEXT:    sub sp, sp, a1
-; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-RV32-NEXT:    csrr a1, vlenb
 ; CHECK-RV32-NEXT:    li a2, 24
 ; CHECK-RV32-NEXT:    mul a1, a1, a2
+; CHECK-RV32-NEXT:    sub sp, sp, a1
+; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-RV32-NEXT:    csrr a1, vlenb
+; CHECK-RV32-NEXT:    slli a1, a1, 4
 ; CHECK-RV32-NEXT:    add a1, sp, a1
 ; CHECK-RV32-NEXT:    addi a1, a1, 16
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    li a1, 64
+; CHECK-RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-RV32-NEXT:    viota.m v16, v0
+; CHECK-RV32-NEXT:    vcpop.m a2, v0
+; CHECK-RV32-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-RV32-NEXT:    vle16.v v24, (a0)
+; CHECK-RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
+; CHECK-RV32-NEXT:    vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV32-NEXT:    csrr a2, vlenb
+; CHECK-RV32-NEXT:    slli a2, a2, 3
+; CHECK-RV32-NEXT:    add a2, sp, a2
+; CHECK-RV32-NEXT:    addi a2, a2, 16
+; CHECK-RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v7, v0, 8
+; CHECK-RV32-NEXT:    vslidedown.vi v24, v0, 8
 ; CHECK-RV32-NEXT:    li a2, 32
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vmv.x.s a3, v0
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-RV32-NEXT:    vcpop.m a4, v0
+; CHECK-RV32-NEXT:    vcpop.m a4, v24
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsrl.vx v25, v0, a2
-; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-RV32-NEXT:    vcpop.m a2, v7
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e16, m8, ta, ma
-; CHECK-RV32-NEXT:    vle16.v v16, (a0)
-; CHECK-RV32-NEXT:    csrr a5, vlenb
-; CHECK-RV32-NEXT:    slli a5, a5, 4
-; CHECK-RV32-NEXT:    add a5, sp, a5
-; CHECK-RV32-NEXT:    addi a5, a5, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a4, v25
-; CHECK-RV32-NEXT:    cpop a4, a4
+; CHECK-RV32-NEXT:    vsrl.vx v8, v0, a2
+; CHECK-RV32-NEXT:    cpop a2, a3
+; CHECK-RV32-NEXT:    vmv.x.s a3, v8
 ; CHECK-RV32-NEXT:    cpop a3, a3
-; CHECK-RV32-NEXT:    add a3, a3, a4
-; CHECK-RV32-NEXT:    slli a3, a3, 1
-; CHECK-RV32-NEXT:    add a0, a0, a3
-; CHECK-RV32-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-RV32-NEXT:    vle16.v v16, (a0)
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 3
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-RV32-NEXT:    viota.m v16, v0
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV32-NEXT:    add a2, a2, a3
+; CHECK-RV32-NEXT:    slli a2, a2, 1
+; CHECK-RV32-NEXT:    add a0, a0, a2
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e16, m8, ta, ma
+; CHECK-RV32-NEXT:    vle16.v v8, (a0)
 ; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT:    viota.m v8, v7
-; CHECK-RV32-NEXT:    vmv1r.v v0, v7
+; CHECK-RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
+; CHECK-RV32-NEXT:    viota.m v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v0, v24
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
+; CHECK-RV32-NEXT:    slli a0, a0, 4
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-RV32-NEXT:    addi a0, sp, 16
+; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-RV32-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    slli a0, a0, 3
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    vrgather.vv v16, v24, v8, v0.t
-; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 5
+; CHECK-RV32-NEXT:    li a1, 24
+; CHECK-RV32-NEXT:    mul a0, a0, a1
 ; CHECK-RV32-NEXT:    add sp, sp, a0
 ; CHECK-RV32-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -749,50 +692,58 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
 ; CHECK-RV64-NEXT:    sub sp, sp, a1
 ; CHECK-RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-RV64-NEXT:    csrr a1, vlenb
-; CHECK-RV64-NEXT:    slli a1, a1, 3
+; CHECK-RV64-NEXT:    slli a1, a1, 4
 ; CHECK-RV64-NEXT:    add a1, sp, a1
 ; CHECK-RV64-NEXT:    addi a1, a1, 16
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    li a1, 64
-; CHECK-RV64-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vi v7, v0, 8
-; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-RV64-NEXT:    viota.m v16, v0
+; CHECK-RV64-NEXT:    csrr a2, vlenb
+; CHECK-RV64-NEXT:    li a3, 24
+; CHECK-RV64-NEXT:    mul a2, a2, a3
+; CHECK-RV64-NEXT:    add a2, sp, a2
+; CHECK-RV64-NEXT:    addi a2, a2, 16
+; CHECK-RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vcpop.m a2, v0
-; CHECK-RV64-NEXT:    vcpop.m a3, v7
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
 ; CHECK-RV64-NEXT:    vle16.v v24, (a0)
-; CHECK-RV64-NEXT:    csrr a4, vlenb
-; CHECK-RV64-NEXT:    slli a4, a4, 4
-; CHECK-RV64-NEXT:    add a4, sp, a4
-; CHECK-RV64-NEXT:    addi a4, a4, 16
-; CHECK-RV64-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    csrr a3, vlenb
+; CHECK-RV64-NEXT:    li a4, 24
+; CHECK-RV64-NEXT:    mul a3, a3, a4
+; CHECK-RV64-NEXT:    add a3, sp, a3
+; CHECK-RV64-NEXT:    addi a3, a3, 16
+; CHECK-RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
+; CHECK-RV64-NEXT:    vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV64-NEXT:    addi a3, sp, 16
+; CHECK-RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vslidedown.vi v0, v0, 8
+; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-RV64-NEXT:    vcpop.m a3, v0
 ; CHECK-RV64-NEXT:    slli a2, a2, 1
 ; CHECK-RV64-NEXT:    add a0, a0, a2
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
-; CHECK-RV64-NEXT:    vle16.v v24, (a0)
+; CHECK-RV64-NEXT:    vle16.v v16, (a0)
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    li a2, 24
 ; CHECK-RV64-NEXT:    mul a0, a0, a2
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-RV64-NEXT:    viota.m v24, v0
+; CHECK-RV64-NEXT:    viota.m v16, v0
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 4
+; CHECK-RV64-NEXT:    slli a0, a0, 3
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV64-NEXT:    addi a0, sp, 16
-; CHECK-RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT:    viota.m v16, v7
+; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    slli a0, a0, 4
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT:    vmv1r.v v0, v7
+; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    li a1, 24
 ; CHECK-RV64-NEXT:    mul a0, a0, a1
@@ -803,11 +754,6 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
 ; CHECK-RV64-NEXT:    slli a0, a0, 3
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 4
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; CHECK-RV64-NEXT:    addi a0, sp, 16
@@ -990,13 +936,13 @@ define <32 x i32> @test_expandload_v32i32(ptr %base, <32 x i1> %mask, <32 x i32>
 ; CHECK-LABEL: test_expandload_v32i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    viota.m v16, v0
 ; CHECK-NEXT:    vcpop.m a2, v0
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a0)
+; CHECK-NEXT:    vle32.v v24, (a0)
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    viota.m v24, v0
-; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    vrgather.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %res = call <32 x i32> @llvm.masked.expandload.v32i32(ptr align 4 %base, <32 x i1> %mask, <32 x i32> %passthru)
   ret <32 x i32> %res
@@ -1023,50 +969,58 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32>
 ; CHECK-RV32-NEXT:    sub sp, sp, a1
 ; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
-; CHECK-RV32-NEXT:    slli a1, a1, 3
+; CHECK-RV32-NEXT:    slli a1, a1, 4
 ; CHECK-RV32-NEXT:    add a1, sp, a1
 ; CHECK-RV32-NEXT:    addi a1, a1, 16
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    li a1, 32
-; CHECK-RV32-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v7, v0, 4
-; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-RV32-NEXT:    viota.m v16, v0
+; CHECK-RV32-NEXT:    csrr a2, vlenb
+; CHECK-RV32-NEXT:    li a3, 24
+; CHECK-RV32-NEXT:    mul a2, a2, a3
+; CHECK-RV32-NEXT:    add a2, sp, a2
+; CHECK-RV32-NEXT:    addi a2, a2, 16
+; CHECK-RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vcpop.m a2, v0
-; CHECK-RV32-NEXT:    vcpop.m a3, v7
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-RV32-NEXT:    vle32.v v24, (a0)
-; CHECK-RV32-NEXT:    csrr a4, vlenb
-; CHECK-RV32-NEXT:    slli a4, a4, 4
-; CHECK-RV32-NEXT:    add a4, sp, a4
-; CHECK-RV32-NEXT:    addi a4, a4, 16
-; CHECK-RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    csrr a3, vlenb
+; CHECK-RV32-NEXT:    li a4, 24
+; CHECK-RV32-NEXT:    mul a3, a3, a4
+; CHECK-RV32-NEXT:    add a3, sp, a3
+; CHECK-RV32-NEXT:    addi a3, a3, 16
+; CHECK-RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; CHECK-RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-RV32-NEXT:    vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV32-NEXT:    addi a3, sp, 16
+; CHECK-RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-RV32-NEXT:    vslidedown.vi v0, v0, 4
+; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32-NEXT:    vcpop.m a3, v0
 ; CHECK-RV32-NEXT:    slli a2, a2, 2
 ; CHECK-RV32-NEXT:    add a0, a0, a2
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-RV32-NEXT:    vle32.v v24, (a0)
+; CHECK-RV32-NEXT:    vle32.v v16, (a0)
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    li a2, 24
 ; CHECK-RV32-NEXT:    mul a0, a0, a2
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-RV32-NEXT:    viota.m v24, v0
+; CHECK-RV32-NEXT:    viota.m v16, v0
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
+; CHECK-RV32-NEXT:    slli a0, a0, 3
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV32-NEXT:    addi a0, sp, 16
-; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT:    viota.m v16, v7
+; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    slli a0, a0, 4
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT:    vmv1r.v v0, v7
+; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    li a1, 24
 ; CHECK-RV32-NEXT:    mul a0, a0, a1
@@ -1077,11 +1031,6 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32>
 ; CHECK-RV32-NEXT:    slli a0, a0, 3
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; CHECK-RV32-NEXT:    addi a0, sp, 16
@@ -1108,55 +1057,68 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32>
 ; CHECK-RV64-NEXT:    addi a1, a1, 16
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    li a1, 32
+; CHECK-RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-RV64-NEXT:    viota.m v16, v0
+; CHECK-RV64-NEXT:    csrr a2, vlenb
+; CHECK-RV64-NEXT:    li a3, 24
+; CHECK-RV64-NEXT:    mul a2, a2, a3
+; CHECK-RV64-NEXT:    add a2, sp, a2
+; CHECK-RV64-NEXT:    addi a2, a2, 16
+; CHECK-RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vcpop.m a2, v0
+; CHECK-RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-RV64-NEXT:    vle32.v v24, (a0)
+; CHECK-RV64-NEXT:    csrr a2, vlenb
+; CHECK-RV64-NEXT:    li a3, 24
+; CHECK-RV64-NEXT:    mul a2, a2, a3
+; CHECK-RV64-NEXT:    add a2, sp, a2
+; CHECK-RV64-NEXT:    addi a2, a2, 16
+; CHECK-RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-RV64-NEXT:    vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV64-NEXT:    addi a2, sp, 16
+; CHECK-RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vi v7, v0, 4
+; CHECK-RV64-NEXT:    vslidedown.vi v24, v0, 4
 ; CHECK-RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-RV64-NEXT:    vmv.x.s a2, v0
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-RV64-NEXT:    vcpop.m a3, v0
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-RV64-NEXT:    vle32.v v24, (a0)
-; CHECK-RV64-NEXT:    csrr a3, vlenb
-; CHECK-RV64-NEXT:    li a4, 24
-; CHECK-RV64-NEXT:    mul a3, a3, a4
-; CHECK-RV64-NEXT:    add a3, sp, a3
-; CHECK-RV64-NEXT:    addi a3, a3, 16
-; CHECK-RV64-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-RV64-NEXT:    vcpop.m a3, v7
+; CHECK-RV64-NEXT:    vcpop.m a3, v24
 ; CHECK-RV64-NEXT:    cpopw a2, a2
 ; CHECK-RV64-NEXT:    slli a2, a2, 2
 ; CHECK-RV64-NEXT:    add a0, a0, a2
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; CHECK-RV64-NEXT:    vle32.v v16, (a0)
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 3
+; CHECK-RV64-NEXT:    li a2, 24
+; CHECK-RV64-NEXT:    mul a0, a0, a2
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-RV64-NEXT:    viota.m v24, v0
+; CHECK-RV64-NEXT:    viota.m v16, v24
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
+; CHECK-RV64-NEXT:    slli a0, a0, 3
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV64-NEXT:    addi a0, sp, 16
-; CHECK-RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT:    viota.m v8, v7
-; CHECK-RV64-NEXT:    vmv1r.v v0, v7
+; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vmv1r.v v0, v24
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    slli a0, a0, 4
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 3
+; CHECK-RV64-NEXT:    li a1, 24
+; CHECK-RV64-NEXT:    mul a0, a0, a1
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT:    csrr a0, vlenb
+; CHECK-RV64-NEXT:    slli a0, a0, 3
+; CHECK-RV64-NEXT:    add a0, sp, a0
+; CHECK-RV64-NEXT:    addi a0, a0, 16
+; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -1329,33 +1291,34 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
 ; CHECK-RV32-NEXT:    addi sp, sp, -16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
-; CHECK-RV32-NEXT:    slli a1, a1, 5
+; CHECK-RV32-NEXT:    li a2, 24
+; CHECK-RV32-NEXT:    mul a1, a1, a2
 ; CHECK-RV32-NEXT:    sub sp, sp, a1
-; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
 ; CHECK-RV32-NEXT:    slli a1, a1, 4
 ; CHECK-RV32-NEXT:    add a1, sp, a1
 ; CHECK-RV32-NEXT:    addi a1, a1, 16
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vcpop.m a1, v0
-; CHECK-RV32-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-RV32-NEXT:    viota.m v16, v0
+; CHECK-RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vle64.v v24, (a0)
-; CHECK-RV32-NEXT:    csrr a1, vlenb
-; CHECK-RV32-NEXT:    li a2, 24
-; CHECK-RV32-NEXT:    mul a1, a1, a2
-; CHECK-RV32-NEXT:    add a1, sp, a1
-; CHECK-RV32-NEXT:    addi a1, a1, 16
-; CHECK-RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
+; CHECK-RV32-NEXT:    vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV32-NEXT:    addi a1, sp, 16
+; CHECK-RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-RV32-NEXT:    vmv.x.s a1, v0
 ; CHECK-RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; CHECK-RV32-NEXT:    zext.h a1, a1
 ; CHECK-RV32-NEXT:    cpop a1, a1
 ; CHECK-RV32-NEXT:    slli a1, a1, 3
 ; CHECK-RV32-NEXT:    add a0, a0, a1
 ; CHECK-RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vcpop.m a1, v7
+; CHECK-RV32-NEXT:    vcpop.m a1, v0
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vle64.v v16, (a0)
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
@@ -1364,18 +1327,7 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
-; CHECK-RV32-NEXT:    viota.m v24, v0
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV32-NEXT:    addi a0, sp, 16
-; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT:    viota.m v8, v7
-; CHECK-RV32-NEXT:    vmv1r.v v0, v7
+; CHECK-RV32-NEXT:    viota.m v8, v0
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    slli a0, a0, 4
 ; CHECK-RV32-NEXT:    add a0, sp, a0
@@ -1390,7 +1342,8 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
 ; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 5
+; CHECK-RV32-NEXT:    li a1, 24
+; CHECK-RV32-NEXT:    mul a0, a0, a1
 ; CHECK-RV32-NEXT:    add sp, sp, a0
 ; CHECK-RV32-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -1402,33 +1355,34 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
 ; CHECK-RV64-NEXT:    addi sp, sp, -16
 ; CHECK-RV64-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-RV64-NEXT:    csrr a1, vlenb
-; CHECK-RV64-NEXT:    slli a1, a1, 5
+; CHECK-RV64-NEXT:    li a2, 24
+; CHECK-RV64-NEXT:    mul a1, a1, a2
 ; CHECK-RV64-NEXT:    sub sp, sp, a1
-; CHECK-RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-RV64-NEXT:    csrr a1, vlenb
 ; CHECK-RV64-NEXT:    slli a1, a1, 4
 ; CHECK-RV64-NEXT:    add a1, sp, a1
 ; CHECK-RV64-NEXT:    addi a1, a1, 16
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vcpop.m a1, v0
-; CHECK-RV64-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-RV64-NEXT:    viota.m v16, v0
+; CHECK-RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vle64.v v24, (a0)
-; CHECK-RV64-NEXT:    csrr a1, vlenb
-; CHECK-RV64-NEXT:    li a2, 24
-; CHECK-RV64-NEXT:    mul a1, a1, a2
-; CHECK-RV64-NEXT:    add a1, sp, a1
-; CHECK-RV64-NEXT:    addi a1, a1, 16
-; CHECK-RV64-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
+; CHECK-RV64-NEXT:    vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV64-NEXT:    addi a1, sp, 16
+; CHECK-RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-RV64-NEXT:    vmv.x.s a1, v0
 ; CHECK-RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; CHECK-RV64-NEXT:    zext.h a1, a1
 ; CHECK-RV64-NEXT:    cpopw a1, a1
 ; CHECK-RV64-NEXT:    slli a1, a1, 3
 ; CHECK-RV64-NEXT:    add a0, a0, a1
 ; CHECK-RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vcpop.m a1, v7
+; CHECK-RV64-NEXT:    vcpop.m a1, v0
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vle64.v v16, (a0)
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
@@ -1437,18 +1391,7 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
-; CHECK-RV64-NEXT:    viota.m v24, v0
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV64-NEXT:    addi a0, sp, 16
-; CHECK-RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT:    viota.m v8, v7
-; CHECK-RV64-NEXT:    vmv1r.v v0, v7
+; CHECK-RV64-NEXT:    viota.m v8, v0
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    slli a0, a0, 4
 ; CHECK-RV64-NEXT:    add a0, sp, a0
@@ -1463,7 +1406,8 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
 ; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 5
+; CHECK-RV64-NEXT:    li a1, 24
+; CHECK-RV64-NEXT:    mul a0, a0, a1
 ; CHECK-RV64-NEXT:    add sp, sp, a0
 ; CHECK-RV64-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -1491,13 +1435,12 @@ define <512 x i8> @test_expandload_v512i8(ptr %base, <512 x i1> %mask, <512 x i8
 ; CHECK-LABEL: test_expandload_v512i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 512
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    viota.m v16, v0
 ; CHECK-NEXT:    vcpop.m a2, v0
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
 ; CHECK-NEXT:    vle8.v v12, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    viota.m v16, v0
-; CHECK-NEXT:    vsetvli zero, zero, e8, m4, ta, mu
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
 ; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
   %res = call <512 x i8> @llvm.masked.expandload.v512i8(ptr align 1 %base, <512 x i1> %mask, <512 x i8> %passthru)
@@ -1630,12 +1573,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 28
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_30: # %else110
-; CHECK-RV32-NEXT:    slli a2, a3, 2
 ; CHECK-RV32-NEXT:    li a1, 32
+; CHECK-RV32-NEXT:    slli a2, a3, 2
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_32
 ; CHECK-RV32-NEXT:  # %bb.31: # %cond.load113
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
@@ -1643,13 +1586,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a2
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 29
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_32: # %else114
-; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v16, v0, a1
+; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_34
 ; CHECK-RV32-NEXT:  # %bb.33: # %cond.load117
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
@@ -1657,8 +1600,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a2
 ; CHECK-RV32-NEXT:    vsetivli zero, 31, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 30
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:  .LBB61_34: # %else118
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -1793,13 +1736,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 61
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_66: # %else242
-; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vi v16, v0, 1
+; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_68
 ; CHECK-RV32-NEXT:  # %bb.67: # %cond.load245
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
@@ -1809,8 +1752,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 62
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:  .LBB61_68: # %else246
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -1945,13 +1888,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 93
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:  .LBB61_100: # %else370
-; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_102
 ; CHECK-RV32-NEXT:  # %bb.101: # %cond.load373
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
@@ -1961,8 +1904,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 94
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:  .LBB61_102: # %else374
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -2097,13 +2040,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 125
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_134: # %else498
-; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vi v16, v0, 2
+; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_136
 ; CHECK-RV32-NEXT:  # %bb.135: # %cond.load501
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
@@ -2113,8 +2056,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 126
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:  .LBB61_136: # %else502
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -2249,13 +2192,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 157
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:  .LBB61_168: # %else626
-; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_170
 ; CHECK-RV32-NEXT:  # %bb.169: # %cond.load629
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
@@ -2265,8 +2208,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 158
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:  .LBB61_170: # %else630
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -2401,13 +2344,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 189
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_202: # %else754
-; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vi v16, v0, 3
+; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_204
 ; CHECK-RV32-NEXT:  # %bb.203: # %cond.load757
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
@@ -2417,8 +2360,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 190
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:  .LBB61_204: # %else758
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -2553,13 +2496,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 221
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:  .LBB61_236: # %else882
-; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_238
 ; CHECK-RV32-NEXT:  # %bb.237: # %cond.load885
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
@@ -2569,8 +2512,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 222
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:  .LBB61_238: # %else886
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -2705,13 +2648,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 253
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_270: # %else1010
-; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vi v16, v0, 4
+; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_272
 ; CHECK-RV32-NEXT:  # %bb.271: # %cond.load1013
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
@@ -2721,8 +2664,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 254
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:  .LBB61_272: # %else1014
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -2859,9 +2802,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_304: # %else1138
-; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_306
 ; CHECK-RV32-NEXT:  # %bb.305: # %cond.load1141
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
@@ -3006,9 +2949,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_338: # %else1266
-; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vi v16, v0, 5
+; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_340
 ; CHECK-RV32-NEXT:  # %bb.339: # %cond.load1269
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
@@ -3153,9 +3096,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_372: # %else1394
-; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_374
 ; CHECK-RV32-NEXT:  # %bb.373: # %cond.load1397
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
@@ -3300,9 +3243,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_406: # %else1522
-; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vi v16, v0, 6
+; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_408
 ; CHECK-RV32-NEXT:  # %bb.407: # %cond.load1525
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
@@ -3447,9 +3390,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_440: # %else1650
-; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_442
 ; CHECK-RV32-NEXT:  # %bb.441: # %cond.load1653
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
@@ -3594,9 +3537,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_474: # %else1778
-; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vi v16, v0, 7
+; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_476
 ; CHECK-RV32-NEXT:  # %bb.475: # %cond.load1781
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
@@ -3741,10 +3684,10 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:  .LBB61_508: # %else1906
-; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v16, v16, a1
-; CHECK-RV32-NEXT:    bgez a2, .LBB61_510
+; CHECK-RV32-NEXT:    slli a1, a3, 1
+; CHECK-RV32-NEXT:    bgez a1, .LBB61_510
 ; CHECK-RV32-NEXT:  # %bb.509: # %cond.load1909
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV32-NEXT:    vmv.s.x v24, a1
@@ -3892,8 +3835,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e8, mf8, tu, ma
 ; CHECK-RV32-NEXT:    vmv.s.x v8, a1
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 2
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_545
@@ -3904,8 +3847,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 4
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_546
@@ -3916,8 +3859,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 8
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_547
@@ -3928,8 +3871,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 3
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 16
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_548
@@ -3940,8 +3883,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 32
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_549
@@ -3952,8 +3895,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 5
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 64
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_550
@@ -3964,8 +3907,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 6
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 128
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_551
@@ -3976,8 +3919,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 7
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 256
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_552
@@ -3988,8 +3931,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 8
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 512
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_553
@@ -4000,8 +3943,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 9
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 1024
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_554
@@ -4012,8 +3955,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 10
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 20
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_555
@@ -4024,8 +3967,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 11
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 19
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_556
@@ -4036,8 +3979,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 12
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 18
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_557
@@ -4048,8 +3991,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 13
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 17
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_558
@@ -4060,8 +4003,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 14
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 16
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_559
@@ -4072,8 +4015,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 15
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 15
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_560
@@ -4084,8 +4027,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 16
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 14
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_561
@@ -4096,8 +4039,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 17
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 13
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_562
@@ -4108,8 +4051,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 18
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 12
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_563
@@ -4120,8 +4063,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 19
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 11
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_564
@@ -4132,8 +4075,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 20
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 10
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_565
@@ -4144,8 +4087,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 21
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 9
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_566
@@ -4156,8 +4099,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 22
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 8
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_567
@@ -4168,8 +4111,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 23
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 7
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_568
@@ -4180,8 +4123,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 24
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 6
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_569
@@ -4192,8 +4135,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 25
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 5
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_570
@@ -4204,8 +4147,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 26
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 4
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_571
@@ -4216,8 +4159,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 27
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 3
 ; CHECK-RV32-NEXT:    bgez a1, .LBB61_1025
@@ -4231,8 +4174,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a3, 32
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 31
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 1
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_573
@@ -4246,8 +4189,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 32
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 2
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_574
@@ -4261,8 +4204,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 33
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 4
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_575
@@ -4276,8 +4219,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 34
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 8
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_576
@@ -4291,8 +4234,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 35
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 16
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_577
@@ -4306,8 +4249,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 36
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 32
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_578
@@ -4321,8 +4264,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 37
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 64
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_579
@@ -4336,8 +4279,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 38
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 128
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_580
@@ -4351,8 +4294,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 39
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 256
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_581
@@ -4366,8 +4309,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 40
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 512
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_582
@@ -4381,8 +4324,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 41
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 1024
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_583
@@ -4396,8 +4339,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 42
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 20
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_584
@@ -4411,8 +4354,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 43
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 19
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_585
@@ -4426,8 +4369,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 44
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 18
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_586
@@ -4441,8 +4384,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 45
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 17
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_587
@@ -4456,8 +4399,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 46
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 16
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_588
@@ -4471,8 +4414,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 47
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 15
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_589
@@ -4486,8 +4429,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 48
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 14
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_590
@@ -4501,8 +4444,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 49
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 13
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_591
@@ -4516,8 +4459,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 50
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 12
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_592
@@ -4531,8 +4474,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 51
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 11
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_593
@@ -4546,8 +4489,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 52
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 10
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_594
@@ -4561,8 +4504,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 53
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 9
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_595
@@ -4576,8 +4519,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 54
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 8
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_596
@@ -4591,8 +4534,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 55
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 7
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_597
@@ -4606,8 +4549,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 56
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 6
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_598
@@ -4621,8 +4564,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 57
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 5
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_599
@@ -4636,8 +4579,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 58
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 4
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_600
@@ -4651,8 +4594,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 59
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 3
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_601
@@ -4666,8 +4609,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 60
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 2
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_1026
@@ -4682,8 +4625,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m1, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 1
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_603
@@ -4697,8 +4640,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 64
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 2
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_604
@@ -4712,8 +4655,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 65
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 4
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_605
@@ -4727,8 +4670,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 66
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 8
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_606
@@ -4742,8 +4685,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 67
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 16
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_607
@@ -4757,8 +4700,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 68
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 32
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_608
@@ -4772,8 +4715,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 69
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 64
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_609
@@ -4787,8 +4730,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 70
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 128
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_610
@@ -4802,8 +4745,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 71
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 256
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_611
@@ -4817,8 +4760,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 72
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 512
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_612
@@ -4832,8 +4775,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 73
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 1024
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_613
@@ -4847,8 +4790,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 74
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 20
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_614
@@ -4862,8 +4805,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 75
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 19
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_615
@@ -4877,8 +4820,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 76
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 18
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_616
@@ -4892,8 +4835,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 77
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 17
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_617
@@ -4907,8 +4850,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 78
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 16
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_618
@@ -4922,8 +4865,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 79
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 15
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_619
@@ -4937,8 +4880,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 80
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 14
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_620
@@ -4952,8 +4895,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 81
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 13
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_621
@@ -4967,8 +4910,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 82
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 12
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_622
@@ -4982,8 +4925,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 83
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 11
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_623
@@ -4997,8 +4940,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 84
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 10
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_624
@@ -5012,8 +4955,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 85
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 9
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_625
@@ -5027,8 +4970,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 86
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 8
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_626
@@ -5042,8 +4985,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 87
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 7
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_627
@@ -5057,8 +5000,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 88
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 6
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_628
@@ -5072,8 +5015,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 89
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 5
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_629
@@ -5087,8 +5030,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 90
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 4
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_630
@@ -5102,8 +5045,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 91
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 3
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_631
@@ -5117,8 +5060,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 92
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 2
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_1027
@@ -5133,8 +5076,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 95
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 1
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_633
@@ -5148,8 +5091,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 96
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 2
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_634
@@ -5163,8 +5106,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 97
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 4
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_635
@@ -5178,8 +5121,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 98
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 8
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_636
@@ -5193,8 +5136,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 99
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 16
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_637
@@ -5208,8 +5151,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 100
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 32
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_638
@@ -5223,8 +5166,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 101
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 64
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_639
@@ -5238,8 +5181,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 102
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 128
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_640
@@ -5253,8 +5196,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 103
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 256
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_641
@@ -5268,8 +5211,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 104
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 512
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_642
@@ -5283,8 +5226,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 105
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 1024
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_643
@@ -5298,8 +5241,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 106
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 20
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_644
@@ -5313,8 +5256,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 107
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 19
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_645
@@ -5328,8 +5271,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 108
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 18
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_646
@@ -5343,8 +5286,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 109
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 17
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_647
@@ -5358,8 +5301,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 110
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 16
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_648
@@ -5373,8 +5316,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 111
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 15
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_649
@@ -5388,8 +5331,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 112
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 14
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_650
@@ -5403,8 +5346,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 113
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 13
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_651
@@ -5418,8 +5361,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 114
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 12
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_652
@@ -5433,8 +5376,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 115
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 11
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_653
@@ -5448,8 +5391,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 116
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 10
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_654
@@ -5463,8 +5406,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 117
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 9
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_655
@@ -5478,8 +5421,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 118
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 8
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_656
@@ -5493,8 +5436,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 119
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 7
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_657
@@ -5508,8 +5451,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 120
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 6
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_658
@@ -5523,8 +5466,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 121
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 5
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_659
@@ -5538,8 +5481,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 122
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 4
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_660
@@ -5553,8 +5496,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 123
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 3
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_661
@@ -5568,8 +5511,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 124
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 2
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_1028
@@ -5584,8 +5527,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 127
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 1
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_663
@@ -5599,8 +5542,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 128
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 2
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_664
@@ -5614,8 +5557,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 129
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 4
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_665
@@ -5629,8 +5572,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 130
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 8
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_666
@@ -5644,8 +5587,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 131
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 16
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_667
@@ -5659,8 +5602,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 132
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 32
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_668
@@ -5674,8 +5617,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 133
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 64
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_669
@@ -5689,8 +5632,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 134
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 128
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_670
@@ -5704,8 +5647,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 135
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 256
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_671
@@ -5719,8 +5662,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 136
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 512
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_672
@@ -5734,8 +5677,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 137
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 1024
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_673
@@ -5749,8 +5692,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 138
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 20
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_674
@@ -5764,8 +5707,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 139
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 19
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_675
@@ -5779,8 +5722,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 140
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 18
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_676
@@ -5794,8 +5737,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 141
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 17
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_677
@@ -5809,8 +5752,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 142
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 16
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_678
@@ -5824,8 +5767,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 143
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 15
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_679
@@ -5839,8 +5782,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 144
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 14
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_680
@@ -5854,8 +5797,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 145
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 13
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_681
@@ -5869,8 +5812,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 146
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 12
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_682
@@ -5884,8 +5827,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 147
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 11
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_683
@@ -5899,8 +5842,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 148
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 10
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_684
@@ -5914,8 +5857,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 149
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 9
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_685
@@ -5929,8 +5872,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 150
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 8
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_686
@@ -5944,8 +5887,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 151
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 7
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_687
@@ -5959,8 +5902,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 152
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 6
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_688
@@ -5974,8 +5917,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 153
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 5
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_689
@@ -5989,8 +5932,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 154
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 4
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_690
@@ -6004,8 +5947,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 155
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 3
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_691
@@ -6019,8 +5962,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 156
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 2
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_1029
@@ -6035,8 +5978,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 159
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 1
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_693
@@ -6050,8 +5993,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 160
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 2
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_694
@@ -6065,8 +6008,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 161
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 4
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_695
@@ -6080,8 +6023,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 162
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 8
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_696
@@ -6095,8 +6038,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 163
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 16
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_697
@@ -6110,8 +6053,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 164
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 32
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_698
@@ -6125,8 +6068,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 165
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 64
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_699
@@ -6140,8 +6083,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 166
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 128
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_700
@@ -6155,8 +6098,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 167
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 256
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_701
@@ -6170,8 +6113,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 168
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 512
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_702
@@ -6185,8 +6128,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 169
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 1024
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_703
@@ -6200,8 +6143,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 170
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 20
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_704
@@ -6215,8 +6158,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 171
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 19
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_705
@@ -6230,8 +6173,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 172
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 18
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_706
@@ -6245,8 +6188,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 173
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 17
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_707
@@ -6260,8 +6203,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 174
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 16
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_708
@@ -6275,8 +6218,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 175
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 15
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_709
@@ -6290,8 +6233,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 176
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 14
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_710
@@ -6305,8 +6248,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 177
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 13
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_711
@@ -6320,8 +6263,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 178
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 12
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_712
@@ -6335,8 +6278,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 179
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 11
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_713
@@ -6350,8 +6293,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 180
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 10
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_714
@@ -6365,8 +6308,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 181
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 9
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_715
@@ -6380,8 +6323,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 182
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 8
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_716
@@ -6395,8 +6338,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 183
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 7
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_717
@@ -6410,8 +6353,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 184
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 6
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_718
@@ -6425,8 +6368,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 185
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 5
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_719
@@ -6440,8 +6383,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 186
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 4
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_720
@@ -6455,8 +6398,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 187
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 3
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_721
@@ -6470,8 +6413,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 188
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 2
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_1030
@@ -6486,8 +6429,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 191
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 1
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_723
@@ -6501,8 +6444,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 192
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 2
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_724
@@ -6516,8 +6459,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 193
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 4
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_725
@@ -6531,8 +6474,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 194
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 8
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_726
@@ -6546,8 +6489,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 195
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 16
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_727
@@ -6561,8 +6504,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 196
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 32
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_728
@@ -6576,8 +6519,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 197
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 64
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_729
@@ -6591,8 +6534,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 198
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 128
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_730
@@ -6606,8 +6549,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 199
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 256
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_731
@@ -6621,8 +6564,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 200
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 512
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_732
@@ -6636,8 +6579,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 201
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 1024
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_733
@@ -6651,8 +6594,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 202
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 20
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_734
@@ -6666,8 +6609,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 203
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 19
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_735
@@ -6681,8 +6624,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 204
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 18
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_736
@@ -6696,8 +6639,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 205
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 17
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_737
@@ -6711,8 +6654,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 206
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 16
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_738
@@ -6726,8 +6669,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 207
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 15
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_739
@@ -6741,8 +6684,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 208
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 14
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_740
@@ -6756,8 +6699,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 209
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 13
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_741
@@ -6771,8 +6714,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 210
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 12
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_742
@@ -6786,8 +6729,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 211
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 11
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_743
@@ -6801,8 +6744,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 212
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 10
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_744
@@ -6816,8 +6759,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 213
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 9
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_745
@@ -6831,8 +6774,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 214
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 8
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_746
@@ -6846,8 +6789,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 215
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 7
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_747
@@ -6861,8 +6804,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 216
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 6
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_748
@@ -6876,8 +6819,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 217
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 5
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_749
@@ -6891,8 +6834,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 218
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 4
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_750
@@ -6906,8 +6849,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 219
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 3
 ; CHECK-RV32-NEXT:    bltz a2, .LBB61_751
@@ -6921,8 +6864,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 220
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    slli a2, a3, 2
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_1031
@@ -6937,8 +6880,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 223
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 1
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_753
@@ -6952,8 +6895,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 224
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 2
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_754
@@ -6967,8 +6910,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 225
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 4
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_755
@@ -6982,8 +6925,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 226
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 8
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_756
@@ -6997,8 +6940,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 227
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 16
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_757
@@ -7012,8 +6955,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 228
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 32
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_758
@@ -7027,8 +6970,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 229
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 64
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_759
@@ -7042,8 +6985,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 230
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 128
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_760
@@ -7057,8 +7000,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 231
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 256
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_761
@@ -7072,8 +7015,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 232
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 512
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_762
@@ -7087,8 +7030,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 233
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 1024
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_763
@@ -7102,8 +7045,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 234
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 20
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_764
@@ -7117,8 +7060,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 235
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 19
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_765
@@ -7132,8 +7075,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 236
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 18
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_766
@@ -7147,8 +7090,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 237
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 17
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_767
@@ -7162,8 +7105,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 238
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 16
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_768
@@ -7177,8 +7120,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 239
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 15
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_769
@@ -7192,8 +7135,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 240
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 14
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_770
@@ -7207,8 +7150,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 241
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 13
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_771
@@ -7222,8 +7165,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 242
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 12
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_772
@@ -7237,8 +7180,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 243
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 11
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_773
@@ -7252,8 +7195,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 244
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 10
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_774
@@ -7267,8 +7210,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 245
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 9
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_775
@@ -7282,8 +7225,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 246
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 8
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_776
@@ -7297,8 +7240,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 247
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 7
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_777
@@ -7312,8 +7255,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 248
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 6
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_778
@@ -7327,8 +7270,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 249
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 5
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_779
@@ -7342,8 +7285,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 250
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 4
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_780
@@ -7357,8 +7300,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 251
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 3
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_781
@@ -7372,8 +7315,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 252
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 2
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_1032
@@ -7388,8 +7331,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    li a4, 255
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
-; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
+; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV32-NEXT:    andi a2, a3, 1
 ; CHECK-RV32-NEXT:    bnez a2, .LBB61_783
@@ -10794,13 +10737,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 61
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:  .LBB61_63: # %else242
-; CHECK-RV64-NEXT:    slli a1, a2, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vi v16, v0, 1
+; CHECK-RV64-NEXT:    slli a1, a2, 1
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_65
 ; CHECK-RV64-NEXT:  # %bb.64: # %cond.load245
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
@@ -10810,8 +10753,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 62
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v24, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV64-NEXT:  .LBB61_65: # %else246
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -11074,13 +11017,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 125
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:  .LBB61_129: # %else498
-; CHECK-RV64-NEXT:    slli a2, a1, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vi v16, v0, 2
+; CHECK-RV64-NEXT:    slli a2, a1, 1
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_131
 ; CHECK-RV64-NEXT:  # %bb.130: # %cond.load501
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
@@ -11090,8 +11033,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 126
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v24, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV64-NEXT:  .LBB61_131: # %else502
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -11354,13 +11297,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 189
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:  .LBB61_195: # %else754
-; CHECK-RV64-NEXT:    slli a1, a2, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vi v16, v0, 3
+; CHECK-RV64-NEXT:    slli a1, a2, 1
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_197
 ; CHECK-RV64-NEXT:  # %bb.196: # %cond.load757
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
@@ -11370,8 +11313,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 190
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v24, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV64-NEXT:  .LBB61_197: # %else758
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -11634,13 +11577,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 253
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:  .LBB61_261: # %else1010
-; CHECK-RV64-NEXT:    slli a2, a1, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vi v16, v0, 4
+; CHECK-RV64-NEXT:    slli a2, a1, 1
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_263
 ; CHECK-RV64-NEXT:  # %bb.262: # %cond.load1013
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
@@ -11650,8 +11593,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 254
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v24, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v24
 ; CHECK-RV64-NEXT:  .LBB61_263: # %else1014
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -11916,9 +11859,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:  .LBB61_327: # %else1266
-; CHECK-RV64-NEXT:    slli a1, a2, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vi v16, v0, 5
+; CHECK-RV64-NEXT:    slli a1, a2, 1
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_329
 ; CHECK-RV64-NEXT:  # %bb.328: # %cond.load1269
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
@@ -12191,9 +12134,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:  .LBB61_393: # %else1522
-; CHECK-RV64-NEXT:    slli a2, a1, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vi v16, v0, 6
+; CHECK-RV64-NEXT:    slli a2, a1, 1
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_395
 ; CHECK-RV64-NEXT:  # %bb.394: # %cond.load1525
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
@@ -12466,9 +12409,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:  .LBB61_459: # %else1778
-; CHECK-RV64-NEXT:    slli a1, a2, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vi v16, v0, 7
+; CHECK-RV64-NEXT:    slli a1, a2, 1
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_461
 ; CHECK-RV64-NEXT:  # %bb.460: # %cond.load1781
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
@@ -12745,8 +12688,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, zero, e8, mf8, tu, ma
 ; CHECK-RV64-NEXT:    vmv.s.x v8, a1
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 2
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_528
@@ -12757,8 +12700,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 4
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_529
@@ -12769,8 +12712,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 8
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_530
@@ -12781,8 +12724,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 16
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_531
@@ -12793,8 +12736,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 4
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 32
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_532
@@ -12805,8 +12748,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 5
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 64
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_533
@@ -12817,8 +12760,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 6
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 128
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_534
@@ -12829,8 +12772,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 7
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 256
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_535
@@ -12841,8 +12784,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 8
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 512
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_536
@@ -12853,8 +12796,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 9
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 1024
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_537
@@ -12865,8 +12808,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 10
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 52
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_538
@@ -12877,8 +12820,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 11
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 51
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_539
@@ -12889,8 +12832,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 12
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 50
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_540
@@ -12901,8 +12844,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 13
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 49
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_541
@@ -12913,8 +12856,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 14
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 48
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_542
@@ -12925,8 +12868,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 15
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 47
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_543
@@ -12937,8 +12880,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 16
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 46
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_544
@@ -12949,8 +12892,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 17
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 45
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_545
@@ -12961,8 +12904,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 18
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 44
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_546
@@ -12973,8 +12916,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 19
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 43
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_547
@@ -12985,8 +12928,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 20
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 42
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_548
@@ -12997,8 +12940,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 21
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 41
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_549
@@ -13009,8 +12952,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 22
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 40
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_550
@@ -13021,8 +12964,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 23
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 39
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_551
@@ -13033,8 +12976,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 24
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 38
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_552
@@ -13045,8 +12988,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 25
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 37
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_553
@@ -13057,8 +13000,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 26
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 36
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_554
@@ -13069,8 +13012,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 27
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 35
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_555
@@ -13081,8 +13024,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 28
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 34
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_556
@@ -13093,8 +13036,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 29
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 33
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_557
@@ -13105,8 +13048,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 30
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 32
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_558
@@ -13119,8 +13062,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a1, 32
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 31
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 31
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_559
@@ -13134,8 +13077,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 32
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 30
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_560
@@ -13149,8 +13092,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 33
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 29
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_561
@@ -13164,8 +13107,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 34
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 28
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_562
@@ -13179,8 +13122,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 35
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 27
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_563
@@ -13194,8 +13137,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 36
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 26
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_564
@@ -13209,8 +13152,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 37
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 25
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_565
@@ -13224,8 +13167,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 38
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 24
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_566
@@ -13239,8 +13182,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 39
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 23
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_567
@@ -13254,8 +13197,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 40
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 22
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_568
@@ -13269,8 +13212,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 41
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 21
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_569
@@ -13284,8 +13227,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 42
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 20
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_570
@@ -13299,8 +13242,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 43
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 19
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_571
@@ -13314,8 +13257,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 44
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 18
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_572
@@ -13329,8 +13272,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 45
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 17
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_573
@@ -13344,8 +13287,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 46
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 16
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_574
@@ -13359,8 +13302,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 47
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 15
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_575
@@ -13374,8 +13317,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 48
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 14
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_576
@@ -13389,8 +13332,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 49
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 13
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_577
@@ -13404,8 +13347,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 50
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 12
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_578
@@ -13419,8 +13362,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 51
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 11
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_579
@@ -13434,8 +13377,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 52
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 10
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_580
@@ -13449,8 +13392,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 53
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 9
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_581
@@ -13464,8 +13407,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 54
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 8
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_582
@@ -13479,8 +13422,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 55
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 7
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_583
@@ -13494,8 +13437,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 56
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 6
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_584
@@ -13509,8 +13452,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 57
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 5
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_585
@@ -13524,8 +13467,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 58
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 4
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_586
@@ -13539,8 +13482,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 59
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 3
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_587
@@ -13554,8 +13497,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 60
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 2
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_1025
@@ -13570,8 +13513,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 63
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m1, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 1
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_589
@@ -13585,8 +13528,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 64
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 2
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_590
@@ -13600,8 +13543,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 65
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 4
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_591
@@ -13615,8 +13558,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 66
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 8
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_592
@@ -13630,8 +13573,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 67
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 16
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_593
@@ -13645,8 +13588,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 68
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 32
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_594
@@ -13660,8 +13603,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 69
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 64
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_595
@@ -13675,8 +13618,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 70
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 128
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_596
@@ -13690,8 +13633,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 71
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 256
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_597
@@ -13705,8 +13648,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 72
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 512
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_598
@@ -13720,8 +13663,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 73
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 1024
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_599
@@ -13735,8 +13678,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 74
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 52
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_600
@@ -13750,8 +13693,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 75
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 51
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_601
@@ -13765,8 +13708,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 76
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 50
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_602
@@ -13780,8 +13723,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 77
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 49
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_603
@@ -13795,8 +13738,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 78
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 48
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_604
@@ -13810,8 +13753,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 79
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 47
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_605
@@ -13825,8 +13768,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 80
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 46
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_606
@@ -13840,8 +13783,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 81
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 45
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_607
@@ -13855,8 +13798,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 82
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 44
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_608
@@ -13870,8 +13813,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 83
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 43
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_609
@@ -13885,8 +13828,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 84
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 42
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_610
@@ -13900,8 +13843,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 85
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 41
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_611
@@ -13915,8 +13858,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 86
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 40
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_612
@@ -13930,8 +13873,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 87
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 39
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_613
@@ -13945,8 +13888,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 88
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 38
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_614
@@ -13960,8 +13903,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 89
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 37
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_615
@@ -13975,8 +13918,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 90
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 36
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_616
@@ -13990,8 +13933,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 91
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 35
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_617
@@ -14005,8 +13948,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 92
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 34
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_618
@@ -14020,8 +13963,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 93
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 33
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_619
@@ -14035,8 +13978,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 94
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 32
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_620
@@ -14050,8 +13993,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 95
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 31
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_621
@@ -14065,8 +14008,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 96
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 30
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_622
@@ -14080,8 +14023,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 97
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 29
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_623
@@ -14095,8 +14038,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 98
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 28
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_624
@@ -14110,8 +14053,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 99
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 27
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_625
@@ -14125,8 +14068,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 100
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 26
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_626
@@ -14140,8 +14083,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 101
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 25
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_627
@@ -14155,8 +14098,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 102
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 24
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_628
@@ -14170,8 +14113,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 103
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 23
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_629
@@ -14185,8 +14128,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 104
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 22
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_630
@@ -14200,8 +14143,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 105
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 21
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_631
@@ -14215,8 +14158,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 106
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 20
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_632
@@ -14230,8 +14173,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 107
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 19
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_633
@@ -14245,8 +14188,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 108
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 18
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_634
@@ -14260,8 +14203,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 109
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 17
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_635
@@ -14275,8 +14218,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 110
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 16
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_636
@@ -14290,8 +14233,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 111
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 15
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_637
@@ -14305,8 +14248,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 112
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 14
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_638
@@ -14320,8 +14263,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 113
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 13
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_639
@@ -14335,8 +14278,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 114
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 12
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_640
@@ -14350,8 +14293,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 115
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 11
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_641
@@ -14365,8 +14308,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 116
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 10
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_642
@@ -14380,8 +14323,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 117
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 9
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_643
@@ -14395,8 +14338,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 118
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 8
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_644
@@ -14410,8 +14353,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 119
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 7
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_645
@@ -14425,8 +14368,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 120
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 6
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_646
@@ -14440,8 +14383,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 121
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 5
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_647
@@ -14455,8 +14398,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 122
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 4
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_648
@@ -14470,8 +14413,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 123
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 3
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_649
@@ -14485,8 +14428,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 124
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 2
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_1026
@@ -14501,8 +14444,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 127
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m2, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 1
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_651
@@ -14516,8 +14459,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 128
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 2
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_652
@@ -14531,8 +14474,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 129
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 4
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_653
@@ -14546,8 +14489,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 130
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 8
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_654
@@ -14561,8 +14504,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 131
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 16
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_655
@@ -14576,8 +14519,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 132
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 32
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_656
@@ -14591,8 +14534,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 133
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 64
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_657
@@ -14606,8 +14549,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 134
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 128
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_658
@@ -14621,8 +14564,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 135
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 256
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_659
@@ -14636,8 +14579,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 136
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 512
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_660
@@ -14651,8 +14594,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 137
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 1024
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_661
@@ -14666,8 +14609,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 138
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 52
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_662
@@ -14681,8 +14624,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 139
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 51
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_663
@@ -14696,8 +14639,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 140
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 50
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_664
@@ -14711,8 +14654,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 141
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 49
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_665
@@ -14726,8 +14669,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 142
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 48
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_666
@@ -14741,8 +14684,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 143
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 47
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_667
@@ -14756,8 +14699,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 144
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 46
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_668
@@ -14771,8 +14714,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 145
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 45
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_669
@@ -14786,8 +14729,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 146
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 44
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_670
@@ -14801,8 +14744,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 147
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 43
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_671
@@ -14816,8 +14759,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 148
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 42
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_672
@@ -14831,8 +14774,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 149
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 41
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_673
@@ -14846,8 +14789,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 150
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 40
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_674
@@ -14861,8 +14804,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 151
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 39
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_675
@@ -14876,8 +14819,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 152
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 38
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_676
@@ -14891,8 +14834,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 153
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 37
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_677
@@ -14906,8 +14849,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 154
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 36
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_678
@@ -14921,8 +14864,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 155
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 35
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_679
@@ -14936,8 +14879,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 156
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 34
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_680
@@ -14951,8 +14894,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 157
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 33
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_681
@@ -14966,8 +14909,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 158
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 32
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_682
@@ -14981,8 +14924,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 159
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 31
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_683
@@ -14996,8 +14939,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 160
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 30
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_684
@@ -15011,8 +14954,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 161
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 29
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_685
@@ -15026,8 +14969,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 162
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 28
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_686
@@ -15041,8 +14984,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 163
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 27
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_687
@@ -15056,8 +14999,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 164
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 26
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_688
@@ -15071,8 +15014,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 165
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 25
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_689
@@ -15086,8 +15029,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 166
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 24
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_690
@@ -15101,8 +15044,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 167
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 23
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_691
@@ -15116,8 +15059,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 168
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 22
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_692
@@ -15131,8 +15074,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 169
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 21
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_693
@@ -15146,8 +15089,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 170
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 20
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_694
@@ -15161,8 +15104,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 171
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 19
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_695
@@ -15176,8 +15119,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 172
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 18
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_696
@@ -15191,8 +15134,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 173
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 17
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_697
@@ -15206,8 +15149,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 174
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 16
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_698
@@ -15221,8 +15164,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 175
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 15
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_699
@@ -15236,8 +15179,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 176
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 14
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_700
@@ -15251,8 +15194,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 177
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 13
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_701
@@ -15266,8 +15209,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 178
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 12
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_702
@@ -15281,8 +15224,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 179
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 11
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_703
@@ -15296,8 +15239,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 180
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 10
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_704
@@ -15311,8 +15254,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 181
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 9
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_705
@@ -15326,8 +15269,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 182
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 8
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_706
@@ -15341,8 +15284,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 183
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 7
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_707
@@ -15356,8 +15299,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 184
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 6
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_708
@@ -15371,8 +15314,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 185
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 5
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_709
@@ -15386,8 +15329,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 186
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 4
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_710
@@ -15401,8 +15344,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 187
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 3
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_711
@@ -15416,8 +15359,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 188
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 2
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_1027
@@ -15432,8 +15375,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 191
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 1
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_713
@@ -15447,8 +15390,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 192
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 2
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_714
@@ -15462,8 +15405,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 193
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 4
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_715
@@ -15477,8 +15420,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 194
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 8
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_716
@@ -15492,8 +15435,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 195
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 16
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_717
@@ -15507,8 +15450,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 196
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 32
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_718
@@ -15522,8 +15465,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 197
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 64
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_719
@@ -15537,8 +15480,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 198
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 128
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_720
@@ -15552,8 +15495,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 199
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 256
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_721
@@ -15567,8 +15510,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 200
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 512
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_722
@@ -15582,8 +15525,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 201
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 1024
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_723
@@ -15597,8 +15540,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 202
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 52
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_724
@@ -15612,8 +15555,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 203
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 51
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_725
@@ -15627,8 +15570,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 204
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 50
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_726
@@ -15642,8 +15585,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 205
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 49
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_727
@@ -15657,8 +15600,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 206
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 48
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_728
@@ -15672,8 +15615,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 207
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 47
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_729
@@ -15687,8 +15630,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 208
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 46
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_730
@@ -15702,8 +15645,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 209
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 45
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_731
@@ -15717,8 +15660,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 210
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 44
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_732
@@ -15732,8 +15675,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 211
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 43
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_733
@@ -15747,8 +15690,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 212
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 42
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_734
@@ -15762,8 +15705,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 213
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 41
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_735
@@ -15777,8 +15720,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 214
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 40
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_736
@@ -15792,8 +15735,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 215
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 39
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_737
@@ -15807,8 +15750,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 216
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 38
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_738
@@ -15822,8 +15765,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 217
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 37
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_739
@@ -15837,8 +15780,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 218
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 36
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_740
@@ -15852,8 +15795,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 219
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 35
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_741
@@ -15867,8 +15810,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 220
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 34
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_742
@@ -15882,8 +15825,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 221
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 33
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_743
@@ -15897,8 +15840,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 222
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 32
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_744
@@ -15912,8 +15855,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 223
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 31
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_745
@@ -15927,8 +15870,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 224
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 30
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_746
@@ -15942,8 +15885,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 225
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 29
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_747
@@ -15957,8 +15900,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 226
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 28
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_748
@@ -15972,8 +15915,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 227
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 27
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_749
@@ -15987,8 +15930,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 228
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 26
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_750
@@ -16002,8 +15945,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 229
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 25
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_751
@@ -16017,8 +15960,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 230
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 24
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_752
@@ -16032,8 +15975,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 231
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 23
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_753
@@ -16047,8 +15990,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 232
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 22
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_754
@@ -16062,8 +16005,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 233
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 21
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_755
@@ -16077,8 +16020,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 234
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 20
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_756
@@ -16092,8 +16035,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 235
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 19
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_757
@@ -16107,8 +16050,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 236
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 18
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_758
@@ -16122,8 +16065,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 237
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 17
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_759
@@ -16137,8 +16080,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 238
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 16
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_760
@@ -16152,8 +16095,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 239
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 15
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_761
@@ -16167,8 +16110,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 240
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 14
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_762
@@ -16182,8 +16125,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 241
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 13
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_763
@@ -16197,8 +16140,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 242
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 12
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_764
@@ -16212,8 +16155,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 243
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 11
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_765
@@ -16227,8 +16170,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 244
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 10
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_766
@@ -16242,8 +16185,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 245
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 9
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_767
@@ -16257,8 +16200,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 246
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 8
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_768
@@ -16272,8 +16215,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 247
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 7
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_769
@@ -16287,8 +16230,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 248
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 6
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_770
@@ -16302,8 +16245,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 249
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 5
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_771
@@ -16317,8 +16260,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 250
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 4
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_772
@@ -16332,8 +16275,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 251
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 3
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_773
@@ -16347,8 +16290,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 252
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 2
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_1028
@@ -16363,8 +16306,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    li a3, 255
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
 ; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
-; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 1
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_775
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
index d60ce408278da..2961b880bdceb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
@@ -1330,14 +1330,14 @@ define double @extractelt_nxv16f64_neg1(<vscale x 16 x double> %v) {
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    addi a0, sp, 64
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a1, -1
 ; RV64-NEXT:    vs8r.v v8, (a0)
-; RV64-NEXT:    slli a3, a2, 3
+; RV64-NEXT:    slli a1, a2, 3
+; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    vs8r.v v16, (a1)
+; RV64-NEXT:    li a1, -1
 ; RV64-NEXT:    srli a1, a1, 32
 ; RV64-NEXT:    slli a2, a2, 1
-; RV64-NEXT:    add a3, a0, a3
 ; RV64-NEXT:    addi a2, a2, -1
-; RV64-NEXT:    vs8r.v v16, (a3)
 ; RV64-NEXT:    bltu a2, a1, .LBB70_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    mv a2, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll
index 796f8dde58f47..4664a48a2d668 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll
@@ -7,9 +7,9 @@ define i1 @extractelt_nxv1i1(ptr %x, i64 %idx) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a1
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
@@ -24,9 +24,9 @@ define i1 @extractelt_nxv2i1(ptr %x, i64 %idx) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a1
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
@@ -41,9 +41,9 @@ define i1 @extractelt_nxv4i1(ptr %x, i64 %idx) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a1
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
@@ -58,9 +58,9 @@ define i1 @extractelt_nxv8i1(ptr %x, i64 %idx) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl1r.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a1
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
@@ -140,14 +140,14 @@ define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind {
 ; RV32-NEXT:    sub sp, sp, a3
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    addi a3, sp, 64
-; RV32-NEXT:    vl8r.v v8, (a0)
 ; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    vl8r.v v24, (a0)
-; RV32-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; RV32-NEXT:    vmseq.vi v0, v8, 0
+; RV32-NEXT:    vl8r.v v8, (a0)
+; RV32-NEXT:    vsetvli a4, zero, e8, m8, ta, ma
 ; RV32-NEXT:    vmv.v.i v16, 0
 ; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vmseq.vi v0, v8, 0
+; RV32-NEXT:    vl8r.v v24, (a0)
 ; RV32-NEXT:    add a2, a3, a2
 ; RV32-NEXT:    vmseq.vi v8, v24, 0
 ; RV32-NEXT:    vmerge.vim v24, v16, 1, v0
@@ -180,14 +180,14 @@ define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind {
 ; RV64-NEXT:    sub sp, sp, a3
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    addi a3, sp, 64
-; RV64-NEXT:    vl8r.v v8, (a0)
 ; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    vl8r.v v24, (a0)
-; RV64-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; RV64-NEXT:    vmseq.vi v0, v8, 0
+; RV64-NEXT:    vl8r.v v8, (a0)
+; RV64-NEXT:    vsetvli a4, zero, e8, m8, ta, ma
 ; RV64-NEXT:    vmv.v.i v16, 0
 ; RV64-NEXT:    add a1, a3, a1
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    vmseq.vi v0, v8, 0
+; RV64-NEXT:    vl8r.v v24, (a0)
 ; RV64-NEXT:    add a2, a3, a2
 ; RV64-NEXT:    vmseq.vi v8, v24, 0
 ; RV64-NEXT:    vmerge.vim v24, v16, 1, v0
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
index a9e129ef11a2c..1546276381021 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
@@ -863,14 +863,14 @@ define i64 @extractelt_nxv16i64_neg1(<vscale x 16 x i64> %v) {
 ; CHECK-NEXT:    andi sp, sp, -64
 ; CHECK-NEXT:    addi a0, sp, 64
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a1, -1
 ; CHECK-NEXT:    vs8r.v v8, (a0)
-; CHECK-NEXT:    slli a3, a2, 3
+; CHECK-NEXT:    slli a1, a2, 3
+; CHECK-NEXT:    add a1, a0, a1
+; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    li a1, -1
 ; CHECK-NEXT:    srli a1, a1, 32
 ; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    vs8r.v v16, (a3)
 ; CHECK-NEXT:    bltu a2, a1, .LBB74_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll
index 1626b362fed15..1263094f3ace0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll
@@ -10,11 +10,11 @@ define <vscale x 1 x half> @ceil_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -33,11 +33,11 @@ define <vscale x 2 x half> @ceil_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -56,11 +56,11 @@ define <vscale x 4 x half> @ceil_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -79,11 +79,11 @@ define <vscale x 8 x half> @ceil_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -102,11 +102,11 @@ define <vscale x 16 x half> @ceil_nxv16f16(<vscale x 16 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -125,11 +125,11 @@ define <vscale x 32 x half> @ceil_nxv32f16(<vscale x 32 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -150,9 +150,9 @@ define <vscale x 1 x float> @ceil_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -173,9 +173,9 @@ define <vscale x 2 x float> @ceil_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -196,9 +196,9 @@ define <vscale x 4 x float> @ceil_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -219,9 +219,9 @@ define <vscale x 8 x float> @ceil_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -242,9 +242,9 @@ define <vscale x 16 x float> @ceil_nxv16f32(<vscale x 16 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -263,11 +263,11 @@ define <vscale x 1 x double> @ceil_nxv1f64(<vscale x 1 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -286,11 +286,11 @@ define <vscale x 2 x double> @ceil_nxv2f64(<vscale x 2 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -309,11 +309,11 @@ define <vscale x 4 x double> @ceil_nxv4f64(<vscale x 4 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -332,11 +332,11 @@ define <vscale x 8 x double> @ceil_nxv8f64(<vscale x 8 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
index 4aca2d694dfbb..e8a787f7b615e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
@@ -18,11 +18,11 @@ define <vscale x 1 x bfloat> @ceil_nxv1bf16(<vscale x 1 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -41,11 +41,11 @@ define <vscale x 2 x bfloat> @ceil_nxv2bf16(<vscale x 2 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -64,11 +64,11 @@ define <vscale x 4 x bfloat> @ceil_nxv4bf16(<vscale x 4 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -87,11 +87,11 @@ define <vscale x 8 x bfloat> @ceil_nxv8bf16(<vscale x 8 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -110,11 +110,11 @@ define <vscale x 16 x bfloat> @ceil_nxv16bf16(<vscale x 16 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -133,11 +133,11 @@ define <vscale x 32 x bfloat> @ceil_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -167,12 +167,12 @@ define <vscale x 32 x bfloat> @ceil_nxv32bf16(<vscale x 32 x bfloat> %x) {
 define <vscale x 1 x half> @ceil_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFH-LABEL: ceil_nxv1f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI6_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI6_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -185,11 +185,11 @@ define <vscale x 1 x half> @ceil_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -206,12 +206,12 @@ declare <vscale x 1 x half> @llvm.ceil.nxv1f16(<vscale x 1 x half>)
 define <vscale x 2 x half> @ceil_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFH-LABEL: ceil_nxv2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI7_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -224,11 +224,11 @@ define <vscale x 2 x half> @ceil_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -245,12 +245,12 @@ declare <vscale x 2 x half> @llvm.ceil.nxv2f16(<vscale x 2 x half>)
 define <vscale x 4 x half> @ceil_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFH-LABEL: ceil_nxv4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI8_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI8_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI8_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI8_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -263,11 +263,11 @@ define <vscale x 4 x half> @ceil_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -284,12 +284,12 @@ declare <vscale x 4 x half> @llvm.ceil.nxv4f16(<vscale x 4 x half>)
 define <vscale x 8 x half> @ceil_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFH-LABEL: ceil_nxv8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI9_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI9_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI9_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI9_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -302,11 +302,11 @@ define <vscale x 8 x half> @ceil_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -323,12 +323,12 @@ declare <vscale x 8 x half> @llvm.ceil.nxv8f16(<vscale x 8 x half>)
 define <vscale x 16 x half> @ceil_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFH-LABEL: ceil_nxv16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI10_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI10_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8
-; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI10_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI10_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -341,11 +341,11 @@ define <vscale x 16 x half> @ceil_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -362,12 +362,12 @@ declare <vscale x 16 x half> @llvm.ceil.nxv16f16(<vscale x 16 x half>)
 define <vscale x 32 x half> @ceil_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFH-LABEL: ceil_nxv32f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI11_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI11_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8
-; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI11_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI11_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -380,11 +380,11 @@ define <vscale x 32 x half> @ceil_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -419,8 +419,8 @@ define <vscale x 1 x float> @ceil_nxv1f32(<vscale x 1 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -439,8 +439,8 @@ define <vscale x 2 x float> @ceil_nxv2f32(<vscale x 2 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -459,8 +459,8 @@ define <vscale x 4 x float> @ceil_nxv4f32(<vscale x 4 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -479,8 +479,8 @@ define <vscale x 8 x float> @ceil_nxv8f32(<vscale x 8 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -499,8 +499,8 @@ define <vscale x 16 x float> @ceil_nxv16f32(<vscale x 16 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -515,12 +515,12 @@ declare <vscale x 16 x float> @llvm.ceil.nxv16f32(<vscale x 16 x float>)
 define <vscale x 1 x double> @ceil_nxv1f64(<vscale x 1 x double> %x) {
 ; CHECK-LABEL: ceil_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -535,12 +535,12 @@ declare <vscale x 1 x double> @llvm.ceil.nxv1f64(<vscale x 1 x double>)
 define <vscale x 2 x double> @ceil_nxv2f64(<vscale x 2 x double> %x) {
 ; CHECK-LABEL: ceil_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -555,12 +555,12 @@ declare <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double>)
 define <vscale x 4 x double> @ceil_nxv4f64(<vscale x 4 x double> %x) {
 ; CHECK-LABEL: ceil_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -575,12 +575,12 @@ declare <vscale x 4 x double> @llvm.ceil.nxv4f64(<vscale x 4 x double>)
 define <vscale x 8 x double> @ceil_nxv8f64(<vscale x 8 x double> %x) {
 ; CHECK-LABEL: ceil_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI20_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI20_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll
index d93f15ec44053..c3d7a9b3e877c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll
@@ -10,11 +10,11 @@ define <vscale x 1 x half> @floor_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -33,11 +33,11 @@ define <vscale x 2 x half> @floor_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -56,11 +56,11 @@ define <vscale x 4 x half> @floor_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -79,11 +79,11 @@ define <vscale x 8 x half> @floor_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -102,11 +102,11 @@ define <vscale x 16 x half> @floor_nxv16f16(<vscale x 16 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -125,11 +125,11 @@ define <vscale x 32 x half> @floor_nxv32f16(<vscale x 32 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -150,9 +150,9 @@ define <vscale x 1 x float> @floor_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -173,9 +173,9 @@ define <vscale x 2 x float> @floor_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -196,9 +196,9 @@ define <vscale x 4 x float> @floor_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -219,9 +219,9 @@ define <vscale x 8 x float> @floor_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -242,9 +242,9 @@ define <vscale x 16 x float> @floor_nxv16f32(<vscale x 16 x float> %x) strictfp
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -263,11 +263,11 @@ define <vscale x 1 x double> @floor_nxv1f64(<vscale x 1 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -286,11 +286,11 @@ define <vscale x 2 x double> @floor_nxv2f64(<vscale x 2 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -309,11 +309,11 @@ define <vscale x 4 x double> @floor_nxv4f64(<vscale x 4 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -332,11 +332,11 @@ define <vscale x 8 x double> @floor_nxv8f64(<vscale x 8 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
index 010d7786c8891..88cd31f77bbbc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
@@ -18,11 +18,11 @@ define <vscale x 1 x bfloat> @floor_nxv1bf16(<vscale x 1 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -42,11 +42,11 @@ define <vscale x 2 x bfloat> @floor_nxv2bf16(<vscale x 2 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -66,11 +66,11 @@ define <vscale x 4 x bfloat> @floor_nxv4bf16(<vscale x 4 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -90,11 +90,11 @@ define <vscale x 8 x bfloat> @floor_nxv8bf16(<vscale x 8 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -114,11 +114,11 @@ define <vscale x 16 x bfloat> @floor_nxv16bf16(<vscale x 16 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -138,11 +138,11 @@ define <vscale x 32 x bfloat> @floor_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -173,12 +173,12 @@ declare <vscale x 32 x bfloat> @llvm.floor.nxv32bf16(<vscale x 32 x bfloat>)
 define <vscale x 1 x half> @floor_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFH-LABEL: floor_nxv1f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI6_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI6_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -191,11 +191,11 @@ define <vscale x 1 x half> @floor_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -212,12 +212,12 @@ declare <vscale x 1 x half> @llvm.floor.nxv1f16(<vscale x 1 x half>)
 define <vscale x 2 x half> @floor_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFH-LABEL: floor_nxv2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI7_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -230,11 +230,11 @@ define <vscale x 2 x half> @floor_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -251,12 +251,12 @@ declare <vscale x 2 x half> @llvm.floor.nxv2f16(<vscale x 2 x half>)
 define <vscale x 4 x half> @floor_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFH-LABEL: floor_nxv4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI8_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI8_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI8_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI8_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -269,11 +269,11 @@ define <vscale x 4 x half> @floor_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -290,12 +290,12 @@ declare <vscale x 4 x half> @llvm.floor.nxv4f16(<vscale x 4 x half>)
 define <vscale x 8 x half> @floor_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFH-LABEL: floor_nxv8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI9_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI9_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI9_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI9_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -308,11 +308,11 @@ define <vscale x 8 x half> @floor_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -329,12 +329,12 @@ declare <vscale x 8 x half> @llvm.floor.nxv8f16(<vscale x 8 x half>)
 define <vscale x 16 x half> @floor_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFH-LABEL: floor_nxv16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI10_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI10_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8
-; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI10_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI10_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -347,11 +347,11 @@ define <vscale x 16 x half> @floor_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -368,12 +368,12 @@ declare <vscale x 16 x half> @llvm.floor.nxv16f16(<vscale x 16 x half>)
 define <vscale x 32 x half> @floor_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFH-LABEL: floor_nxv32f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI11_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI11_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8
-; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI11_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI11_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -386,11 +386,11 @@ define <vscale x 32 x half> @floor_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -425,8 +425,8 @@ define <vscale x 1 x float> @floor_nxv1f32(<vscale x 1 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -445,8 +445,8 @@ define <vscale x 2 x float> @floor_nxv2f32(<vscale x 2 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -465,8 +465,8 @@ define <vscale x 4 x float> @floor_nxv4f32(<vscale x 4 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -485,8 +485,8 @@ define <vscale x 8 x float> @floor_nxv8f32(<vscale x 8 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -505,8 +505,8 @@ define <vscale x 16 x float> @floor_nxv16f32(<vscale x 16 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -521,12 +521,12 @@ declare <vscale x 16 x float> @llvm.floor.nxv16f32(<vscale x 16 x float>)
 define <vscale x 1 x double> @floor_nxv1f64(<vscale x 1 x double> %x) {
 ; CHECK-LABEL: floor_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -541,12 +541,12 @@ declare <vscale x 1 x double> @llvm.floor.nxv1f64(<vscale x 1 x double>)
 define <vscale x 2 x double> @floor_nxv2f64(<vscale x 2 x double> %x) {
 ; CHECK-LABEL: floor_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -561,12 +561,12 @@ declare <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double>)
 define <vscale x 4 x double> @floor_nxv4f64(<vscale x 4 x double> %x) {
 ; CHECK-LABEL: floor_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -581,12 +581,12 @@ declare <vscale x 4 x double> @llvm.floor.nxv4f64(<vscale x 4 x double>)
 define <vscale x 8 x double> @floor_nxv8f64(<vscale x 8 x double> %x) {
 ; CHECK-LABEL: floor_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI20_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI20_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll
index 1752dfd50d0c5..2b973c9b80828 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll
@@ -22,27 +22,27 @@ define <512 x i8> @single_source(<512 x i8> %a) {
 ; CHECK-NEXT:    addi a1, sp, 512
 ; CHECK-NEXT:    vmv.x.s a2, v16
 ; CHECK-NEXT:    vslidedown.vi v24, v16, 5
-; CHECK-NEXT:    li a3, 432
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    li a0, 432
 ; CHECK-NEXT:    vmv.v.x v8, a2
-; CHECK-NEXT:    lbu a0, 770(sp)
+; CHECK-NEXT:    lbu a1, 770(sp)
+; CHECK-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 431
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    lbu a0, 1012(sp)
-; CHECK-NEXT:    vsetvli zero, a3, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v24, a1
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 4
-; CHECK-NEXT:    li a1, 466
-; CHECK-NEXT:    vmv.s.x v16, a0
-; CHECK-NEXT:    li a0, 465
+; CHECK-NEXT:    vslidedown.vi v16, v16, 4
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, tu, ma
+; CHECK-NEXT:    vslideup.vx v8, v24, a1
+; CHECK-NEXT:    li a0, 466
+; CHECK-NEXT:    lbu a1, 1012(sp)
+; CHECK-NEXT:    vmv.s.x v24, a1
+; CHECK-NEXT:    li a1, 465
 ; CHECK-NEXT:    li a2, 501
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v24, a0
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, tu, ma
+; CHECK-NEXT:    vslideup.vx v8, v16, a1
 ; CHECK-NEXT:    li a0, 500
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v16, a0
+; CHECK-NEXT:    vslideup.vx v8, v24, a0
 ; CHECK-NEXT:    addi sp, s0, -1536
 ; CHECK-NEXT:    .cfi_def_cfa sp, 1536
 ; CHECK-NEXT:    ld ra, 1528(sp) # 8-byte Folded Reload
@@ -103,12 +103,7 @@ define <512 x i8> @two_source(<512 x i8> %a, <512 x i8> %b) {
 ; CHECK-NEXT:    .cfi_offset s0, -16
 ; CHECK-NEXT:    addi s0, sp, 1536
 ; CHECK-NEXT:    .cfi_def_cfa s0, 0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    andi sp, sp, -512
-; CHECK-NEXT:    addi a0, sp, 1520
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv8r.v v24, v8
 ; CHECK-NEXT:    li a0, 512
@@ -127,32 +122,30 @@ define <512 x i8> @two_source(<512 x i8> %a, <512 x i8> %b) {
 ; CHECK-NEXT:    li a3, 465
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vse8.v v24, (a1)
-; CHECK-NEXT:    lbu a1, 985(sp)
+; CHECK-NEXT:    li a1, 478
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v0, a3
-; CHECK-NEXT:    li a2, 478
+; CHECK-NEXT:    lbu a2, 985(sp)
 ; CHECK-NEXT:    lbu a3, 1012(sp)
-; CHECK-NEXT:    vmv.s.x v24, a1
-; CHECK-NEXT:    li a1, 477
-; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v24, a1
+; CHECK-NEXT:    vmv.s.x v24, a2
+; CHECK-NEXT:    li a2, 477
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
+; CHECK-NEXT:    vslideup.vx v8, v24, a2
 ; CHECK-NEXT:    li a1, 501
+; CHECK-NEXT:    vmv.s.x v24, a3
+; CHECK-NEXT:    li a2, 500
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
+; CHECK-NEXT:    vslideup.vx v8, v24, a2
+; CHECK-NEXT:    lui a1, %hi(.LCPI2_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI2_0)
 ; CHECK-NEXT:    lui a2, %hi(.LCPI2_1)
 ; CHECK-NEXT:    addi a2, a2, %lo(.LCPI2_1)
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vle8.v v24, (a1)
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
 ; CHECK-NEXT:    vle64.v v0, (a2)
-; CHECK-NEXT:    li a2, 500
-; CHECK-NEXT:    vmv.s.x v24, a3
-; CHECK-NEXT:    lui a3, %hi(.LCPI2_0)
-; CHECK-NEXT:    addi a3, a3, %lo(.LCPI2_0)
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vle8.v v16, (a3)
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v24, a2
-; CHECK-NEXT:    addi a1, sp, 1520
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v24, v16, v0.t
+; CHECK-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    addi sp, s0, -1536
 ; CHECK-NEXT:    .cfi_def_cfa sp, 1536
 ; CHECK-NEXT:    ld ra, 1528(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
index 84da351de76ba..5f0088a47af24 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
@@ -417,9 +417,9 @@ declare <32 x i64> @llvm.vp.abs.v32i64(<32 x i64>, i1 immarg, <32 x i1>, i32)
 define <32 x i64> @vp_abs_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_abs_v32i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB34_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll
index 425422417ec78..753a90c22a366 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll
@@ -9,10 +9,10 @@ define <512 x i8> @bitcast_1024B(<256 x i16> %a, <512 x i8> %b) {
 ; VLEN256-NEXT:    addi a1, a0, 256
 ; VLEN256-NEXT:    li a2, 256
 ; VLEN256-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; VLEN256-NEXT:    vle8.v v24, (a0)
-; VLEN256-NEXT:    vle8.v v0, (a1)
-; VLEN256-NEXT:    vadd.vv v8, v24, v8
-; VLEN256-NEXT:    vadd.vv v16, v0, v16
+; VLEN256-NEXT:    vle8.v v24, (a1)
+; VLEN256-NEXT:    vle8.v v0, (a0)
+; VLEN256-NEXT:    vadd.vv v8, v0, v8
+; VLEN256-NEXT:    vadd.vv v16, v24, v16
 ; VLEN256-NEXT:    ret
 ;
 ; VLEN512-LABEL: bitcast_1024B:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
index 5ea4924468595..1ba173455a8f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
@@ -978,60 +978,60 @@ define <2 x i64> @vp_bitreverse_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 24
+; RV32-NEXT:    vsrl.vi v10, v8, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsll.vx v10, v8, a2
+; RV32-NEXT:    vsll.vx v11, v8, a2
 ; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vsrl.vx v11, v8, a2
-; RV32-NEXT:    vsrl.vx v12, v8, a4
+; RV32-NEXT:    vsrl.vx v12, v8, a2
+; RV32-NEXT:    vsrl.vx v13, v8, a4
+; RV32-NEXT:    vand.vx v9, v9, a5
+; RV32-NEXT:    vand.vx v13, v13, a1
+; RV32-NEXT:    vor.vv v12, v13, v12
 ; RV32-NEXT:    vand.vx v13, v8, a1
-; RV32-NEXT:    vand.vx v12, v12, a1
-; RV32-NEXT:    vor.vv v11, v12, v11
+; RV32-NEXT:    vsll.vx v13, v13, a4
+; RV32-NEXT:    vor.vv v11, v11, v13
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a6), zero
+; RV32-NEXT:    vlse64.v v13, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v13, v13, a4
-; RV32-NEXT:    vor.vv v10, v10, v13
-; RV32-NEXT:    vsrl.vi v13, v8, 8
-; RV32-NEXT:    vand.vx v9, v9, a5
-; RV32-NEXT:    vand.vv v13, v13, v12
-; RV32-NEXT:    vor.vv v9, v13, v9
+; RV32-NEXT:    vand.vv v10, v10, v13
+; RV32-NEXT:    vor.vv v9, v10, v9
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    vand.vv v12, v8, v12
+; RV32-NEXT:    vand.vv v10, v8, v13
 ; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    addi a2, a2, 819
 ; RV32-NEXT:    addi a3, a3, 1365
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v12, v12, 8
-; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vsll.vi v10, v10, 8
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vor.vv v9, v9, v11
+; RV32-NEXT:    vor.vv v9, v9, v12
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v11, a2
+; RV32-NEXT:    vmv.v.x v12, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vor.vv v8, v11, v8
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a3
+; RV32-NEXT:    vmv.v.x v11, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vand.vv v9, v9, v12
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vand.vv v9, v9, v11
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v9, v9, v12
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v9, v9, v11
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1250,25 +1250,25 @@ define <4 x i64> @vp_bitreverse_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    lui a5, 4080
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v14, v8, 24
+; RV32-NEXT:    vsrl.vi v10, v8, 24
+; RV32-NEXT:    vsrl.vi v14, v8, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsll.vx v12, v8, a2
 ; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vsrl.vx v10, v8, a2
-; RV32-NEXT:    vsrl.vx v16, v8, a4
-; RV32-NEXT:    vand.vx v18, v8, a1
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    vor.vv v10, v16, v10
+; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    vsrl.vx v18, v8, a4
+; RV32-NEXT:    vand.vx v20, v10, a5
+; RV32-NEXT:    vand.vx v10, v18, a1
+; RV32-NEXT:    vor.vv v10, v10, v16
+; RV32-NEXT:    vand.vx v16, v8, a1
+; RV32-NEXT:    vsll.vx v16, v16, a4
+; RV32-NEXT:    vor.vv v12, v12, v16
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v18, v18, a4
-; RV32-NEXT:    vor.vv v12, v12, v18
-; RV32-NEXT:    vsrl.vi v18, v8, 8
-; RV32-NEXT:    vand.vx v14, v14, a5
-; RV32-NEXT:    vand.vv v18, v18, v16
-; RV32-NEXT:    vor.vv v14, v18, v14
+; RV32-NEXT:    vand.vv v14, v14, v16
+; RV32-NEXT:    vor.vv v14, v14, v20
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    lui a3, 349525
@@ -1523,25 +1523,25 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    lui a5, 4080
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v20, v8, 24
+; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    vsrl.vi v20, v8, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsll.vx v16, v8, a2
 ; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vsrl.vx v12, v8, a2
-; RV32-NEXT:    vsrl.vx v24, v8, a4
-; RV32-NEXT:    vand.vx v28, v8, a1
-; RV32-NEXT:    vand.vx v24, v24, a1
-; RV32-NEXT:    vor.vv v12, v24, v12
+; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    vsrl.vx v28, v8, a4
+; RV32-NEXT:    vand.vx v4, v12, a5
+; RV32-NEXT:    vand.vx v12, v28, a1
+; RV32-NEXT:    vor.vv v12, v12, v24
+; RV32-NEXT:    vand.vx v24, v8, a1
+; RV32-NEXT:    vsll.vx v24, v24, a4
+; RV32-NEXT:    vor.vv v16, v16, v24
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v28, v28, a4
-; RV32-NEXT:    vor.vv v16, v16, v28
-; RV32-NEXT:    vsrl.vi v28, v8, 8
-; RV32-NEXT:    vand.vx v20, v20, a5
-; RV32-NEXT:    vand.vv v28, v28, v24
-; RV32-NEXT:    vor.vv v20, v28, v20
+; RV32-NEXT:    vand.vv v20, v20, v24
+; RV32-NEXT:    vor.vv v20, v20, v4
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    lui a3, 349525
@@ -1676,35 +1676,36 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
 ; RV32-NEXT:    addi a3, a4, 819
 ; RV32-NEXT:    sw a3, 32(sp)
 ; RV32-NEXT:    sw a3, 36(sp)
-; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    addi a4, a5, 1365
-; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    addi a5, a6, -256
 ; RV32-NEXT:    sw a4, 24(sp)
 ; RV32-NEXT:    sw a4, 28(sp)
+; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
+; RV32-NEXT:    addi a5, a6, -256
 ; RV32-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV32-NEXT:    vsll.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 4
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 48
+; RV32-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v16, v24, a3, v0.t
+; RV32-NEXT:    vsll.vi v8, v16, 24, v0.t
+; RV32-NEXT:    addi a6, sp, 48
+; RV32-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a4), zero
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    slli a4, a4, 3
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
 ; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v16, v24, a3, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV32-NEXT:    addi a4, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    addi a4, sp, 48
 ; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
@@ -1739,14 +1740,14 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 40
 ; RV32-NEXT:    addi a2, sp, 32
+; RV32-NEXT:    addi a3, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
@@ -1761,7 +1762,7 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v24, v24, 2, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
@@ -1869,75 +1870,76 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vmv8r.v v16, v8
 ; RV32-NEXT:    lui a1, 1044480
 ; RV32-NEXT:    lui a2, 61681
 ; RV32-NEXT:    lui a3, 209715
 ; RV32-NEXT:    lui a4, 349525
 ; RV32-NEXT:    li a5, 56
 ; RV32-NEXT:    lui a6, 16
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a5
-; RV32-NEXT:    vsrl.vx v24, v8, a5
+; RV32-NEXT:    vsll.vx v8, v8, a5
+; RV32-NEXT:    vsrl.vx v24, v16, a5
 ; RV32-NEXT:    li a5, 40
+; RV32-NEXT:    addi a6, a6, -256
+; RV32-NEXT:    vsrl.vx v0, v16, a5
+; RV32-NEXT:    vand.vx v0, v0, a6
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    addi a7, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a7) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v16, a6
+; RV32-NEXT:    lui a6, 4080
+; RV32-NEXT:    vsll.vx v0, v0, a5
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    csrr a7, vlenb
+; RV32-NEXT:    slli a7, a7, 3
+; RV32-NEXT:    add a7, sp, a7
+; RV32-NEXT:    addi a7, a7, 48
+; RV32-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v16, 24
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw zero, 20(sp)
 ; RV32-NEXT:    addi a1, a2, -241
+; RV32-NEXT:    addi a2, a3, 819
+; RV32-NEXT:    addi a3, a4, 1365
+; RV32-NEXT:    vand.vx v0, v0, a6
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 4080
-; RV32-NEXT:    addi a2, a3, 819
 ; RV32-NEXT:    sw a2, 32(sp)
 ; RV32-NEXT:    sw a2, 36(sp)
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    addi a3, a4, 1365
-; RV32-NEXT:    addi a4, a6, -256
-; RV32-NEXT:    vsrl.vx v0, v8, a5
 ; RV32-NEXT:    sw a3, 24(sp)
 ; RV32-NEXT:    sw a3, 28(sp)
-; RV32-NEXT:    vand.vx v0, v0, a4
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vx v0, v0, a5
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a2), zero
+; RV32-NEXT:    vlse64.v v8, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vand.vv v24, v8, v0
-; RV32-NEXT:    vand.vx v8, v8, a1
-; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v0, v8, v24
+; RV32-NEXT:    vand.vv v24, v24, v8
+; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vand.vx v16, v16, a6
+; RV32-NEXT:    vsll.vi v16, v16, 24
+; RV32-NEXT:    vsll.vi v8, v8, 8
+; RV32-NEXT:    vor.vv v0, v16, v8
 ; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    addi a1, sp, 40
 ; RV32-NEXT:    addi a2, sp, 32
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v16, v0
+; RV32-NEXT:    addi a3, sp, 24
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1947,7 +1949,7 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev
 ; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v16, v8
@@ -2072,35 +2074,36 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
 ; RV32-NEXT:    addi a3, a4, 819
 ; RV32-NEXT:    sw a3, 32(sp)
 ; RV32-NEXT:    sw a3, 36(sp)
-; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    addi a4, a5, 1365
-; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    addi a5, a6, -256
 ; RV32-NEXT:    sw a4, 24(sp)
 ; RV32-NEXT:    sw a4, 28(sp)
+; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
+; RV32-NEXT:    addi a5, a6, -256
 ; RV32-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV32-NEXT:    vsll.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 4
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 48
+; RV32-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v16, v24, a3, v0.t
+; RV32-NEXT:    vsll.vi v8, v16, 24, v0.t
+; RV32-NEXT:    addi a6, sp, 48
+; RV32-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a4), zero
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    slli a4, a4, 3
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
 ; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v16, v24, a3, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV32-NEXT:    addi a4, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    addi a4, sp, 48
 ; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
@@ -2135,14 +2138,14 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 40
 ; RV32-NEXT:    addi a2, sp, 32
+; RV32-NEXT:    addi a3, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
@@ -2157,7 +2160,7 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v24, v24, 2, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
@@ -2265,75 +2268,76 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vmv8r.v v16, v8
 ; RV32-NEXT:    lui a1, 1044480
 ; RV32-NEXT:    lui a2, 61681
 ; RV32-NEXT:    lui a3, 209715
 ; RV32-NEXT:    lui a4, 349525
 ; RV32-NEXT:    li a5, 56
 ; RV32-NEXT:    lui a6, 16
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a5
-; RV32-NEXT:    vsrl.vx v24, v8, a5
+; RV32-NEXT:    vsll.vx v8, v8, a5
+; RV32-NEXT:    vsrl.vx v24, v16, a5
 ; RV32-NEXT:    li a5, 40
+; RV32-NEXT:    addi a6, a6, -256
+; RV32-NEXT:    vsrl.vx v0, v16, a5
+; RV32-NEXT:    vand.vx v0, v0, a6
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    addi a7, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a7) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v16, a6
+; RV32-NEXT:    lui a6, 4080
+; RV32-NEXT:    vsll.vx v0, v0, a5
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    csrr a7, vlenb
+; RV32-NEXT:    slli a7, a7, 3
+; RV32-NEXT:    add a7, sp, a7
+; RV32-NEXT:    addi a7, a7, 48
+; RV32-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v16, 24
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw zero, 20(sp)
 ; RV32-NEXT:    addi a1, a2, -241
+; RV32-NEXT:    addi a2, a3, 819
+; RV32-NEXT:    addi a3, a4, 1365
+; RV32-NEXT:    vand.vx v0, v0, a6
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 4080
-; RV32-NEXT:    addi a2, a3, 819
 ; RV32-NEXT:    sw a2, 32(sp)
 ; RV32-NEXT:    sw a2, 36(sp)
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    addi a3, a4, 1365
-; RV32-NEXT:    addi a4, a6, -256
-; RV32-NEXT:    vsrl.vx v0, v8, a5
 ; RV32-NEXT:    sw a3, 24(sp)
 ; RV32-NEXT:    sw a3, 28(sp)
-; RV32-NEXT:    vand.vx v0, v0, a4
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vx v0, v0, a5
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a2), zero
+; RV32-NEXT:    vlse64.v v8, (a5), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vand.vv v24, v8, v0
-; RV32-NEXT:    vand.vx v8, v8, a1
-; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v0, v8, v24
+; RV32-NEXT:    vand.vv v24, v24, v8
+; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vand.vx v16, v16, a6
+; RV32-NEXT:    vsll.vi v16, v16, 24
+; RV32-NEXT:    vsll.vi v8, v8, 8
+; RV32-NEXT:    vor.vv v0, v16, v8
 ; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    addi a1, sp, 40
 ; RV32-NEXT:    addi a2, sp, 32
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v16, v0
+; RV32-NEXT:    addi a3, sp, 24
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -2343,7 +2347,7 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev
 ; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v16, v8
@@ -2455,9 +2459,9 @@ define <128 x i16> @vp_bitreverse_v128i16(<128 x i16> %va, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    li a2, 64
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 8
+; CHECK-NEXT:    li a2, 64
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB34_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
index d765e4c0b8f6a..37caf61aac19c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
@@ -370,31 +370,31 @@ define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 24
+; RV32-NEXT:    vsrl.vi v10, v8, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsll.vx v10, v8, a2
+; RV32-NEXT:    vsll.vx v11, v8, a2
 ; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vsrl.vx v11, v8, a2
-; RV32-NEXT:    vsrl.vx v12, v8, a4
+; RV32-NEXT:    vsrl.vx v12, v8, a2
+; RV32-NEXT:    vsrl.vx v13, v8, a4
+; RV32-NEXT:    vand.vx v9, v9, a5
+; RV32-NEXT:    vand.vx v13, v13, a1
+; RV32-NEXT:    vor.vv v12, v13, v12
 ; RV32-NEXT:    vand.vx v13, v8, a1
-; RV32-NEXT:    vand.vx v12, v12, a1
-; RV32-NEXT:    vor.vv v11, v12, v11
+; RV32-NEXT:    vsll.vx v13, v13, a4
+; RV32-NEXT:    vor.vv v11, v11, v13
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a6), zero
+; RV32-NEXT:    vlse64.v v13, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v13, v13, a4
-; RV32-NEXT:    vor.vv v10, v10, v13
-; RV32-NEXT:    vsrl.vi v13, v8, 8
-; RV32-NEXT:    vand.vx v9, v9, a5
-; RV32-NEXT:    vand.vv v13, v13, v12
-; RV32-NEXT:    vor.vv v9, v13, v9
-; RV32-NEXT:    vand.vv v12, v8, v12
+; RV32-NEXT:    vand.vv v10, v10, v13
+; RV32-NEXT:    vor.vv v9, v10, v9
+; RV32-NEXT:    vand.vv v10, v8, v13
 ; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v12, v12, 8
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v10, v8
-; RV32-NEXT:    vor.vv v9, v9, v11
+; RV32-NEXT:    vsll.vi v10, v10, 8
+; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v8, v11, v8
+; RV32-NEXT:    vor.vv v9, v9, v12
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -530,31 +530,31 @@ define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vi v10, v8, 24
+; RV32-NEXT:    vsrl.vi v12, v8, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsll.vx v12, v8, a2
+; RV32-NEXT:    vsll.vx v14, v8, a2
 ; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vsrl.vx v14, v8, a2
-; RV32-NEXT:    vsrl.vx v16, v8, a4
+; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    vsrl.vx v18, v8, a4
+; RV32-NEXT:    vand.vx v10, v10, a5
+; RV32-NEXT:    vand.vx v18, v18, a1
+; RV32-NEXT:    vor.vv v16, v18, v16
 ; RV32-NEXT:    vand.vx v18, v8, a1
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    vor.vv v14, v16, v14
+; RV32-NEXT:    vsll.vx v18, v18, a4
+; RV32-NEXT:    vor.vv v14, v14, v18
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a6), zero
+; RV32-NEXT:    vlse64.v v18, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v18, v18, a4
-; RV32-NEXT:    vor.vv v12, v12, v18
-; RV32-NEXT:    vsrl.vi v18, v8, 8
-; RV32-NEXT:    vand.vx v10, v10, a5
-; RV32-NEXT:    vand.vv v18, v18, v16
-; RV32-NEXT:    vor.vv v10, v18, v10
-; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vand.vv v12, v12, v18
+; RV32-NEXT:    vor.vv v10, v12, v10
+; RV32-NEXT:    vand.vv v12, v8, v18
 ; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vor.vv v10, v10, v14
+; RV32-NEXT:    vsll.vi v12, v12, 8
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v8, v14, v8
+; RV32-NEXT:    vor.vv v10, v10, v16
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -690,31 +690,31 @@ define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    vsrl.vi v16, v8, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsll.vx v16, v8, a2
+; RV32-NEXT:    vsll.vx v20, v8, a2
 ; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vsrl.vx v20, v8, a2
-; RV32-NEXT:    vsrl.vx v24, v8, a4
+; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    vsrl.vx v28, v8, a4
+; RV32-NEXT:    vand.vx v12, v12, a5
+; RV32-NEXT:    vand.vx v28, v28, a1
+; RV32-NEXT:    vor.vv v24, v28, v24
 ; RV32-NEXT:    vand.vx v28, v8, a1
-; RV32-NEXT:    vand.vx v24, v24, a1
-; RV32-NEXT:    vor.vv v20, v24, v20
+; RV32-NEXT:    vsll.vx v28, v28, a4
+; RV32-NEXT:    vor.vv v20, v20, v28
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a6), zero
+; RV32-NEXT:    vlse64.v v28, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v28, v28, a4
-; RV32-NEXT:    vor.vv v16, v16, v28
-; RV32-NEXT:    vsrl.vi v28, v8, 8
-; RV32-NEXT:    vand.vx v12, v12, a5
-; RV32-NEXT:    vand.vv v28, v28, v24
-; RV32-NEXT:    vor.vv v12, v28, v12
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v28
+; RV32-NEXT:    vor.vv v12, v16, v12
+; RV32-NEXT:    vand.vv v16, v8, v28
 ; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v12, v12, v20
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v20, v8
+; RV32-NEXT:    vor.vv v12, v12, v24
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -768,61 +768,63 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vmv8r.v v16, v8
 ; RV32-NEXT:    lui a1, 1044480
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    addi a5, sp, 8
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsll.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vand.vx v24, v16, a1, v0.t
 ; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v8, v16, a5, v0.t
+; RV32-NEXT:    vsll.vi v8, v8, 24, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    vlse64.v v8, (a6), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 4080
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT:    vand.vv v24, v16, v8, v0.t
+; RV32-NEXT:    vsll.vi v8, v24, 8, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v8, v16, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v16, a4, v0.t
 ; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 24, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
@@ -916,48 +918,48 @@ define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    lui a5, 4080
-; RV32-NEXT:    addi a6, sp, 8
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v24, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
 ; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    addi a2, sp, 8
+; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vsrl.vx v0, v8, a4
-; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vand.vx v0, v0, a3
 ; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a1
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 3
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a3
 ; RV32-NEXT:    vsll.vx v0, v0, a4
 ; RV32-NEXT:    vor.vv v16, v24, v0
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 24
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    vand.vx v0, v0, a5
+; RV32-NEXT:    vsrl.vi v24, v8, 8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a6), zero
+; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a5
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vand.vv v24, v24, v16
+; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    vand.vv v16, v8, v16
 ; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v24, v8
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -1031,61 +1033,63 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vmv8r.v v16, v8
 ; RV32-NEXT:    lui a1, 1044480
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    addi a5, sp, 8
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsll.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vand.vx v24, v16, a1, v0.t
 ; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v8, v16, a5, v0.t
+; RV32-NEXT:    vsll.vi v8, v8, 24, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    vlse64.v v8, (a6), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 4080
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT:    vand.vv v24, v16, v8, v0.t
+; RV32-NEXT:    vsll.vi v8, v24, 8, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vx v8, v16, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v16, a4, v0.t
 ; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 24, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
@@ -1179,48 +1183,48 @@ define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    lui a3, 16
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    lui a5, 4080
-; RV32-NEXT:    addi a6, sp, 8
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v24, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
 ; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    addi a2, sp, 8
+; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    vsrl.vx v0, v8, a4
-; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vand.vx v0, v0, a3
 ; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a1
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    slli a6, a6, 3
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a3
 ; RV32-NEXT:    vsll.vx v0, v0, a4
 ; RV32-NEXT:    vor.vv v16, v24, v0
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 24
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    vand.vx v0, v0, a5
+; RV32-NEXT:    vsrl.vi v24, v8, 8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a6), zero
+; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a5
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v0
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vand.vv v24, v24, v16
+; RV32-NEXT:    vor.vv v24, v24, v0
+; RV32-NEXT:    vand.vv v16, v8, v16
 ; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v24, v8
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -1298,9 +1302,9 @@ define <128 x i16> @vp_bswap_v128i16(<128 x i16> %va, <128 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    li a2, 64
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 8
+; CHECK-NEXT:    li a2, 64
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
index dbbb8362144ca..781c61b571994 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
@@ -9,10 +9,10 @@ define <4 x i32> @add_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI0_0)
-; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
+; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %e0 = add i32 %a, 23
@@ -37,10 +37,10 @@ define <8 x i32> @add_constant_rhs_8xi32(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e,
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a4
-; CHECK-NEXT:    vle32.v v10, (a0)
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a5
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a6
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a7
+; CHECK-NEXT:    vle32.v v10, (a0)
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %e0 = add i32 %a, 23
@@ -70,10 +70,10 @@ define <4 x i32> @sub_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI2_0)
-; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
+; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %e0 = sub i32 %a, 23
@@ -94,10 +94,10 @@ define <4 x i32> @mul_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI3_0)
-; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
+; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vmul.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %e0 = mul i32 %a, 23
@@ -125,15 +125,15 @@ define <4 x i32> @udiv_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI4_1)
 ; CHECK-NEXT:    vslide1down.vx v9, v9, a1
-; CHECK-NEXT:    vle32.v v11, (a0)
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
 ; CHECK-NEXT:    vmulhu.vv v10, v8, v10
-; CHECK-NEXT:    vsub.vv v12, v8, v10
-; CHECK-NEXT:    vmulhu.vv v9, v12, v9
+; CHECK-NEXT:    vsub.vv v11, v8, v10
+; CHECK-NEXT:    vmulhu.vv v9, v11, v9
+; CHECK-NEXT:    vle32.v v11, (a0)
 ; CHECK-NEXT:    vadd.vv v9, v9, v10
-; CHECK-NEXT:    vmv.v.i v0, 4
 ; CHECK-NEXT:    vsrl.vv v9, v9, v11
+; CHECK-NEXT:    vmv.v.i v0, 4
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    ret
   %e0 = udiv i32 %a, 23
@@ -155,10 +155,10 @@ define <4 x float> @fadd_constant_rhs(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT:    vfmv.v.f v8, fa0
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI5_0)
-; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa1
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa2
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa3
+; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %e0 = fadd float %a, 23.0
@@ -179,10 +179,10 @@ define <4 x float> @fdiv_constant_rhs(float %a, float %b, float %c, float %d) {
 ; CHECK-NEXT:    vfmv.v.f v8, fa0
 ; CHECK-NEXT:    lui a0, %hi(.LCPI6_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI6_0)
-; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa1
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa2
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa3
+; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vfdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %e0 = fdiv float %a, 23.0
@@ -317,10 +317,10 @@ define <4 x i32> @add_constant_rhs_inverse(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI11_0)
-; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
+; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %e0 = sub i32 %a, 1
@@ -341,10 +341,10 @@ define <4 x i32> @add_constant_rhs_commute(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI12_0)
-; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
+; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %e0 = add i32 %a, 23
@@ -562,21 +562,20 @@ define <8 x i32> @add_constant_rhs_8xi32_partial(<8 x i32> %vin, i32 %a, i32 %b,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-NEXT:    vmv.s.x v10, a0
-; CHECK-NEXT:    vmv.s.x v12, a1
 ; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vmv.s.x v10, a2
-; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; CHECK-NEXT:    vmv.s.x v10, a1
 ; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 5
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
+; CHECK-NEXT:    vslideup.vi v8, v10, 5
+; CHECK-NEXT:    vmv.s.x v10, a2
 ; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 6
 ; CHECK-NEXT:    vmv.s.x v10, a3
+; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI19_0)
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 7
-; CHECK-NEXT:    vadd.vv v8, v8, v12
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vadd.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vadd = add <8 x i32> %vin, <i32 1, i32 2, i32 3, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
   %e0 = add i32 %a, 23
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll
index 60a9948198c8f..78a6acfac4581 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll
@@ -87,14 +87,14 @@ define fastcc <128 x i32> @ret_split_v128i32(ptr %x) {
 ; CHECK-NEXT:    addi a2, a1, 256
 ; CHECK-NEXT:    vle32.v v16, (a2)
 ; CHECK-NEXT:    addi a2, a1, 384
-; CHECK-NEXT:    vle32.v v24, (a1)
-; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle32.v v0, (a2)
-; CHECK-NEXT:    addi a2, a0, 256
-; CHECK-NEXT:    vse32.v v24, (a0)
+; CHECK-NEXT:    vle32.v v24, (a2)
+; CHECK-NEXT:    addi a2, a0, 384
+; CHECK-NEXT:    vle32.v v0, (a1)
+; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    vse32.v v0, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vse32.v v0, (a1)
-; CHECK-NEXT:    vse32.v v16, (a2)
+; CHECK-NEXT:    vse32.v v24, (a2)
+; CHECK-NEXT:    vse32.v v16, (a1)
 ; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v = load <128 x i32>, ptr %x
@@ -207,14 +207,15 @@ define fastcc <32 x i32> @ret_v32i32_call_v32i32_v32i32_v32i32_i32(<32 x i32> %x
 ; CHECK-NEXT:    addi s0, sp, 256
 ; CHECK-NEXT:    .cfi_def_cfa s0, 0
 ; CHECK-NEXT:    andi sp, sp, -128
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv8r.v v24, v8
 ; CHECK-NEXT:    li a2, 32
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    mv a3, sp
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    mv a0, sp
+; CHECK-NEXT:    vse32.v v24, (a0)
 ; CHECK-NEXT:    mv a0, sp
 ; CHECK-NEXT:    li a2, 42
-; CHECK-NEXT:    vse32.v v8, (a3)
-; CHECK-NEXT:    vmv.v.v v8, v24
 ; CHECK-NEXT:    call ext3
 ; CHECK-NEXT:    addi sp, s0, -256
 ; CHECK-NEXT:    .cfi_def_cfa sp, 256
@@ -269,8 +270,8 @@ define fastcc <32 x i32> @pass_vector_arg_indirect_stack(<32 x i32> %x, <32 x i3
 ; CHECK-NEXT:    mv t3, sp
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    li t4, 8
 ; CHECK-NEXT:    vse32.v v8, (t0)
+; CHECK-NEXT:    li t4, 8
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    call vector_arg_indirect_stack
@@ -307,17 +308,15 @@ define fastcc <32 x i32> @vector_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3
 define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z) {
 ; CHECK-LABEL: pass_vector_arg_direct_stack:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -176
-; CHECK-NEXT:    .cfi_def_cfa_offset 176
-; CHECK-NEXT:    sd ra, 168(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    sd s0, 160(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    addi sp, sp, -160
+; CHECK-NEXT:    .cfi_def_cfa_offset 160
+; CHECK-NEXT:    sd ra, 152(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset ra, -8
-; CHECK-NEXT:    .cfi_offset s0, -16
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    addi t0, sp, 16
 ; CHECK-NEXT:    li t1, 1
 ; CHECK-NEXT:    li t2, 13
-; CHECK-NEXT:    li s0, 12
+; CHECK-NEXT:    li t5, 12
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    li a2, 2
 ; CHECK-NEXT:    li a3, 3
@@ -326,23 +325,21 @@ define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32>
 ; CHECK-NEXT:    li a6, 6
 ; CHECK-NEXT:    li a7, 7
 ; CHECK-NEXT:    li t3, 8
+; CHECK-NEXT:    sd t1, 144(sp)
+; CHECK-NEXT:    li t4, 9
+; CHECK-NEXT:    sd t5, 0(sp)
+; CHECK-NEXT:    sd t2, 8(sp)
+; CHECK-NEXT:    li t5, 10
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vse32.v v8, (t0)
-; CHECK-NEXT:    li t4, 9
-; CHECK-NEXT:    li t5, 10
-; CHECK-NEXT:    sd t1, 144(sp)
 ; CHECK-NEXT:    li t6, 11
-; CHECK-NEXT:    sd s0, 0(sp)
-; CHECK-NEXT:    sd t2, 8(sp)
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    call vector_arg_direct_stack
-; CHECK-NEXT:    ld ra, 168(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    ld s0, 160(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_restore ra
-; CHECK-NEXT:    .cfi_restore s0
-; CHECK-NEXT:    addi sp, sp, 176
+; CHECK-NEXT:    addi sp, sp, 160
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %s = call fastcc <32 x i32> @vector_arg_direct_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, i32 1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
index f42b4a3a26aad..34600d9a0eaf4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
@@ -87,14 +87,14 @@ define <128 x i32> @ret_split_v128i32(ptr %x) {
 ; CHECK-NEXT:    addi a2, a1, 256
 ; CHECK-NEXT:    vle32.v v16, (a2)
 ; CHECK-NEXT:    addi a2, a1, 384
-; CHECK-NEXT:    vle32.v v24, (a1)
-; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle32.v v0, (a2)
-; CHECK-NEXT:    addi a2, a0, 256
-; CHECK-NEXT:    vse32.v v24, (a0)
+; CHECK-NEXT:    vle32.v v24, (a2)
+; CHECK-NEXT:    addi a2, a0, 384
+; CHECK-NEXT:    vle32.v v0, (a1)
+; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    vse32.v v0, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vse32.v v0, (a1)
-; CHECK-NEXT:    vse32.v v16, (a2)
+; CHECK-NEXT:    vse32.v v24, (a2)
+; CHECK-NEXT:    vse32.v v16, (a1)
 ; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v = load <128 x i32>, ptr %x
@@ -207,14 +207,15 @@ define <32 x i32> @ret_v32i32_call_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x
 ; CHECK-NEXT:    addi s0, sp, 256
 ; CHECK-NEXT:    .cfi_def_cfa s0, 0
 ; CHECK-NEXT:    andi sp, sp, -128
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv8r.v v24, v8
 ; CHECK-NEXT:    li a2, 32
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    mv a3, sp
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    mv a0, sp
+; CHECK-NEXT:    vse32.v v24, (a0)
 ; CHECK-NEXT:    mv a0, sp
 ; CHECK-NEXT:    li a2, 42
-; CHECK-NEXT:    vse32.v v8, (a3)
-; CHECK-NEXT:    vmv.v.v v8, v24
 ; CHECK-NEXT:    call ext3
 ; CHECK-NEXT:    addi sp, s0, -256
 ; CHECK-NEXT:    .cfi_def_cfa sp, 256
@@ -267,9 +268,9 @@ define <32 x i32> @call_split_vector_args(ptr %pa, ptr %pb) {
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v16, (a1)
-; CHECK-NEXT:    mv a1, sp
 ; CHECK-NEXT:    mv a0, sp
-; CHECK-NEXT:    vse32.v v16, (a1)
+; CHECK-NEXT:    vse32.v v16, (a0)
+; CHECK-NEXT:    mv a0, sp
 ; CHECK-NEXT:    vmv1r.v v9, v8
 ; CHECK-NEXT:    vmv1r.v v10, v8
 ; CHECK-NEXT:    vmv1r.v v11, v8
@@ -313,7 +314,7 @@ define <32 x i32> @pass_vector_arg_via_stack(<32 x i32> %x, <32 x i32> %y, <32 x
 ; CHECK-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset ra, -8
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    li t0, 8
+; CHECK-NEXT:    li a7, 8
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    li a2, 2
 ; CHECK-NEXT:    li a3, 3
@@ -322,9 +323,9 @@ define <32 x i32> @pass_vector_arg_via_stack(<32 x i32> %x, <32 x i32> %y, <32 x
 ; CHECK-NEXT:    li a6, 6
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    sd a7, 128(sp)
 ; CHECK-NEXT:    vse32.v v8, (sp)
 ; CHECK-NEXT:    li a7, 7
-; CHECK-NEXT:    sd t0, 128(sp)
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    call vector_arg_via_stack
@@ -378,8 +379,8 @@ define <4 x i1> @pass_vector_mask_arg_via_stack(<4 x i1> %v) {
 ; CHECK-NEXT:    vmv.v.v v17, v16
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmsne.vi v16, v17, 0
-; CHECK-NEXT:    li a7, 7
 ; CHECK-NEXT:    vsm.v v16, (a2)
+; CHECK-NEXT:    li a7, 7
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    li a1, 0
 ; CHECK-NEXT:    li a2, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
index a9b255bb62aeb..3c79f42177721 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
@@ -13,13 +13,13 @@ declare <2 x half> @llvm.vp.ceil.v2f16(<2 x half>, <2 x i1>, i32)
 define <2 x half> @vp_ceil_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_v2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI0_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI0_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI0_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -35,12 +35,12 @@ define <2 x half> @vp_ceil_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -59,12 +59,12 @@ define <2 x half> @vp_ceil_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 define <2 x half> @vp_ceil_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_v2f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI1_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI1_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI1_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -77,11 +77,11 @@ define <2 x half> @vp_ceil_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -99,13 +99,13 @@ declare <4 x half> @llvm.vp.ceil.v4f16(<4 x half>, <4 x i1>, i32)
 define <4 x half> @vp_ceil_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_v4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI2_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI2_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI2_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -121,12 +121,12 @@ define <4 x half> @vp_ceil_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -145,12 +145,12 @@ define <4 x half> @vp_ceil_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 define <4 x half> @vp_ceil_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_v4f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI3_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI3_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI3_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -163,11 +163,11 @@ define <4 x half> @vp_ceil_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -185,13 +185,13 @@ declare <8 x half> @llvm.vp.ceil.v8f16(<8 x half>, <8 x i1>, i32)
 define <8 x half> @vp_ceil_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_v8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI4_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI4_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI4_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -207,12 +207,12 @@ define <8 x half> @vp_ceil_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
@@ -231,12 +231,12 @@ define <8 x half> @vp_ceil_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 define <8 x half> @vp_ceil_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_v8f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI5_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI5_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -249,11 +249,11 @@ define <8 x half> @vp_ceil_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -273,12 +273,12 @@ define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %e
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v10, v0
+; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a0)
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 3
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -295,12 +295,12 @@ define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %e
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
@@ -319,12 +319,12 @@ define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %e
 define <16 x half> @vp_ceil_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_v16f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI7_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 3
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -337,11 +337,11 @@ define <16 x half> @vp_ceil_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -363,9 +363,9 @@ define <2 x float> @vp_ceil_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -384,8 +384,8 @@ define <2 x float> @vp_ceil_v2f32_unmasked(<2 x float> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -405,9 +405,9 @@ define <4 x float> @vp_ceil_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -426,8 +426,8 @@ define <4 x float> @vp_ceil_v4f32_unmasked(<4 x float> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -448,9 +448,9 @@ define <8 x float> @vp_ceil_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -470,8 +470,8 @@ define <8 x float> @vp_ceil_v8f32_unmasked(<8 x float> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -492,9 +492,9 @@ define <16 x float> @vp_ceil_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -514,8 +514,8 @@ define <16 x float> @vp_ceil_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl)
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -531,13 +531,13 @@ declare <2 x double> @llvm.vp.ceil.v2f64(<2 x double>, <2 x i1>, i32)
 define <2 x double> @vp_ceil_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI16_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI16_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
+; CHECK-NEXT:    lui a0, %hi(.LCPI16_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI16_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -552,12 +552,12 @@ define <2 x double> @vp_ceil_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %e
 define <2 x double> @vp_ceil_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v2f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI17_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -575,12 +575,12 @@ define <4 x double> @vp_ceil_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %e
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -596,12 +596,12 @@ define <4 x double> @vp_ceil_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %e
 define <4 x double> @vp_ceil_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v4f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI19_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -619,12 +619,12 @@ define <8 x double> @vp_ceil_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %e
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a0)
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -640,12 +640,12 @@ define <8 x double> @vp_ceil_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %e
 define <8 x double> @vp_ceil_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v8f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI21_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI21_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI21_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI21_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -663,12 +663,12 @@ define <15 x double> @vp_ceil_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroex
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -684,12 +684,12 @@ define <15 x double> @vp_ceil_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroex
 define <15 x double> @vp_ceil_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v15f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI23_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI23_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI23_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI23_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -707,12 +707,12 @@ define <16 x double> @vp_ceil_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroex
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -728,12 +728,12 @@ define <16 x double> @vp_ceil_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroex
 define <16 x double> @vp_ceil_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v16f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI25_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI25_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI25_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI25_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -762,8 +762,8 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
@@ -778,33 +778,33 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index 9d0d42cf754c5..99007aaa8a106 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -1503,38 +1503,29 @@ declare <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32)
 define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v15i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1547,57 +1538,34 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vnot.v v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v16, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 48
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
@@ -1671,47 +1639,49 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vx v0, v8, a1
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vx v16, v8, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vnot.v v0, v8
-; RV32-NEXT:    vsrl.vi v8, v0, 1
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v24, v0, v24
-; RV32-NEXT:    vand.vv v0, v24, v16
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v0
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1775,38 +1745,29 @@ declare <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32)
 define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1819,57 +1780,34 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vnot.v v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v16, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 48
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
@@ -1943,47 +1881,49 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vx v0, v8, a1
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vx v16, v8, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vnot.v v0, v8
-; RV32-NEXT:    vsrl.vi v8, v0, 1
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v24, v0, v24
-; RV32-NEXT:    vand.vv v0, v24, v16
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v0
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -2055,7 +1995,8 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -2072,12 +2013,12 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    sw a2, 36(sp)
 ; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 257
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:    addi a2, a2, 257
 ; RV32-NEXT:    sw a2, 16(sp)
 ; RV32-NEXT:    sw a2, 20(sp)
+; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    bltu a0, a1, .LBB34_2
 ; RV32-NEXT:  # %bb.1:
@@ -2087,7 +2028,6 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    addi a3, sp, 40
-; RV32-NEXT:    addi a4, sp, 32
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -2102,34 +2042,25 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a3, a3, a5
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v16, v8, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a4), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
@@ -2137,38 +2068,41 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    li a4, 48
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -2180,61 +2114,37 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a3, sp, 24
-; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a3, a3, a5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v8, (a4), zero
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -2244,7 +2154,8 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    vmv1r.v v0, v24
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
@@ -2266,18 +2177,20 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -2290,41 +2203,25 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
+; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    li a1, 48
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -2332,21 +2229,20 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -2372,9 +2268,9 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:    mv a1, a0
 ; RV64-NEXT:    bltu a0, a2, .LBB34_2
 ; RV64-NEXT:  # %bb.1:
@@ -2495,14 +2391,14 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a2, 36(sp)
 ; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 257
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    addi a1, a2, 257
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw a2, 20(sp)
+; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a3, .LBB35_2
+; RV32-NEXT:    bltu a0, a2, .LBB35_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB35_2:
@@ -2550,76 +2446,58 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vsrl.vi v0, v16, 8
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v0, v8
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 16
-; RV32-NEXT:    vor.vv v16, v16, v8
+; RV32-NEXT:    vsrl.vi v0, v16, 16
+; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v0, 1
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 48
-; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    vand.vv v0, v0, v24
+; RV32-NEXT:    vsub.vv v0, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v8, v16, a2
+; RV32-NEXT:    vor.vv v24, v16, v8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v24, v0, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v0, v16, a2
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v24, v24, v8
-; RV32-NEXT:    vadd.vv v24, v0, v24
+; RV32-NEXT:    vand.vv v16, v0, v8
+; RV32-NEXT:    vsrl.vi v0, v0, 2
+; RV32-NEXT:    vand.vv v0, v0, v8
+; RV32-NEXT:    vadd.vv v16, v16, v0
 ; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v16, v16
-; RV32-NEXT:    vsrl.vi v0, v16, 1
+; RV32-NEXT:    vnot.v v24, v24
+; RV32-NEXT:    vsrl.vi v0, v24, 1
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vsub.vv v0, v16, v0
-; RV32-NEXT:    addi a4, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    vsub.vv v24, v24, v0
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v24, 4
-; RV32-NEXT:    vadd.vv v16, v24, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 48
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v0, v8
-; RV32-NEXT:    vsrl.vi v0, v0, 2
-; RV32-NEXT:    vand.vv v8, v0, v8
+; RV32-NEXT:    vand.vv v0, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v24, 2
+; RV32-NEXT:    vand.vv v8, v24, v8
+; RV32-NEXT:    addi a2, sp, 24
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a2), zero
+; RV32-NEXT:    vlse64.v v24, (a2), zero
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a3), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -4213,38 +4091,29 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v15i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4257,57 +4126,34 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vnot.v v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v16, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 48
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
@@ -4381,47 +4227,49 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vx v0, v8, a1
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vx v16, v8, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vnot.v v0, v8
-; RV32-NEXT:    vsrl.vi v8, v0, 1
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v24, v0, v24
-; RV32-NEXT:    vand.vv v0, v24, v16
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v0
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -4483,38 +4331,29 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_zero_undef_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4527,57 +4366,34 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vnot.v v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v16, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 48
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
@@ -4651,47 +4467,49 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vx v0, v8, a1
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vx v16, v8, a1
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vnot.v v0, v8
-; RV32-NEXT:    vsrl.vi v8, v0, 1
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v24, v0, v24
-; RV32-NEXT:    vand.vv v0, v24, v16
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v0
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -4761,7 +4579,8 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -4778,12 +4597,12 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    sw a2, 36(sp)
 ; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 257
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:    addi a2, a2, 257
 ; RV32-NEXT:    sw a2, 16(sp)
 ; RV32-NEXT:    sw a2, 20(sp)
+; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    bltu a0, a1, .LBB70_2
 ; RV32-NEXT:  # %bb.1:
@@ -4793,7 +4612,6 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    addi a3, sp, 40
-; RV32-NEXT:    addi a4, sp, 32
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4808,34 +4626,25 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a3, a3, a5
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v16, v8, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a4), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
@@ -4843,38 +4652,41 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    li a4, 48
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -4886,61 +4698,37 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a3, sp, 24
-; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a3, a3, a5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v8, (a4), zero
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -4950,7 +4738,8 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    vmv1r.v v0, v24
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
@@ -4972,18 +4761,20 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -4996,41 +4787,25 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
+; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    li a1, 48
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -5038,21 +4813,20 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -5078,9 +4852,9 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:    mv a1, a0
 ; RV64-NEXT:    bltu a0, a2, .LBB70_2
 ; RV64-NEXT:  # %bb.1:
@@ -5201,14 +4975,14 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    sw a2, 36(sp)
 ; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 257
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    addi a1, a2, 257
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw a2, 20(sp)
+; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a3, .LBB71_2
+; RV32-NEXT:    bltu a0, a2, .LBB71_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB71_2:
@@ -5256,76 +5030,58 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    vsrl.vi v0, v16, 8
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v0, v8
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 16
-; RV32-NEXT:    vor.vv v16, v16, v8
+; RV32-NEXT:    vsrl.vi v0, v16, 16
+; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v0, 1
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 48
-; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    vand.vv v0, v0, v24
+; RV32-NEXT:    vsub.vv v0, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v8, v16, a2
+; RV32-NEXT:    vor.vv v24, v16, v8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v24, v0, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v0, v16, a2
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v24, v24, v8
-; RV32-NEXT:    vadd.vv v24, v0, v24
+; RV32-NEXT:    vand.vv v16, v0, v8
+; RV32-NEXT:    vsrl.vi v0, v0, 2
+; RV32-NEXT:    vand.vv v0, v0, v8
+; RV32-NEXT:    vadd.vv v16, v16, v0
 ; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v16, v16
-; RV32-NEXT:    vsrl.vi v0, v16, 1
+; RV32-NEXT:    vnot.v v24, v24
+; RV32-NEXT:    vsrl.vi v0, v24, 1
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vsub.vv v0, v16, v0
-; RV32-NEXT:    addi a4, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    vsub.vv v24, v24, v0
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v24, 4
-; RV32-NEXT:    vadd.vv v16, v24, v16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 48
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v0, v8
-; RV32-NEXT:    vsrl.vi v0, v0, 2
-; RV32-NEXT:    vand.vv v8, v0, v8
+; RV32-NEXT:    vand.vv v0, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v24, 2
+; RV32-NEXT:    vand.vv v8, v24, v8
+; RV32-NEXT:    addi a2, sp, 24
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a2), zero
+; RV32-NEXT:    vlse64.v v24, (a2), zero
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a3), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index a5a1061842427..dea0ebfd56946 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -1119,70 +1119,55 @@ declare <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64>, <15 x i1>, i32)
 define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v15i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 48
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
@@ -1248,26 +1233,28 @@ define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v16, v0
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v0, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v0, v8
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
@@ -1318,70 +1305,55 @@ declare <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64>, <16 x i1>, i32)
 define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctpop_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 48
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
@@ -1447,26 +1419,28 @@ define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v16, v0
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v0, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v0, v8
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
@@ -1520,17 +1494,18 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v7, v0, 2
+; RV32-NEXT:    vslidedown.vi v24, v0, 2
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
@@ -1542,102 +1517,118 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    sw a2, 36(sp)
 ; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 257
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    addi a1, a2, 257
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw a2, 20(sp)
+; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a3, .LBB34_2
+; RV32-NEXT:    bltu a0, a2, .LBB34_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB34_2:
-; RV32-NEXT:    addi a2, sp, 40
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a2), zero
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a2, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a2), zero
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a2, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a2, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    slli a2, a2, 4
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a2), zero
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    slli a2, a2, 4
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -1645,51 +1636,83 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    vmv1r.v v0, v7
+; RV32-NEXT:    vmv1r.v v0, v24
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    li a3, 48
+; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    addi a0, sp, 48
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
@@ -1710,9 +1733,9 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:    mv a1, a0
 ; RV64-NEXT:    bltu a0, a2, .LBB34_2
 ; RV64-NEXT:  # %bb.1:
@@ -1792,12 +1815,9 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv8r.v v24, v16
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
@@ -1809,135 +1829,103 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a2, 36(sp)
 ; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 257
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    addi a1, a2, 257
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw a2, 20(sp)
+; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a3, .LBB35_2
+; RV32-NEXT:    bltu a0, a2, .LBB35_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB35_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    vsrl.vi v24, v8, 1
 ; RV32-NEXT:    addi a2, sp, 40
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a2), zero
+; RV32-NEXT:    vlse64.v v8, (a2), zero
 ; RV32-NEXT:    addi a2, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    addi a2, sp, 32
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    vand.vv v24, v24, v8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 1
+; RV32-NEXT:    vand.vv v0, v0, v8
+; RV32-NEXT:    addi a2, sp, 32
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a2), zero
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv8r.v v8, v24
+; RV32-NEXT:    vlse64.v v24, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v24, 1
-; RV32-NEXT:    vand.vv v16, v24, v16
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v16, v24, v16
+; RV32-NEXT:    vand.vv v0, v8, v24
 ; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v16, v0
+; RV32-NEXT:    vand.vv v0, v16, v24
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v16, v16, 2
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v0
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v8, v0
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    addi a2, sp, 24
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v8, v16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a2), zero
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v24, v24, v0
+; RV32-NEXT:    vadd.vv v16, v24, v16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v0
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a2), zero
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v24, 4
-; RV32-NEXT:    vadd.vv v16, v24, v16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v24, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a2), zero
+; RV32-NEXT:    addi a2, sp, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v24, v8
+; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v24, v0
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v24, v8, v0
+; RV32-NEXT:    vmul.vv v16, v16, v24
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v16, a2
+; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v24, a2
+; RV32-NEXT:    vsrl.vx v16, v16, a2
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    addi sp, sp, 48
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
index 4fbe67cfcd642..a39fc835f9d85 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
@@ -167,8 +167,6 @@ define void @ctpop_v2i64(ptr %x, ptr %y) {
 ;
 ; RV64-LABEL: ctpop_v2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    lui a1, 349525
 ; RV64-NEXT:    lui a2, 209715
 ; RV64-NEXT:    lui a3, 61681
@@ -185,6 +183,8 @@ define void @ctpop_v2i64(ptr %x, ptr %y) {
 ; RV64-NEXT:    add a3, a3, a5
 ; RV64-NEXT:    slli a5, a4, 32
 ; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
 ; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v9
@@ -473,8 +473,6 @@ define void @ctpop_v4i64(ptr %x, ptr %y) {
 ;
 ; RV64-LABEL: ctpop_v4i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    lui a1, 349525
 ; RV64-NEXT:    lui a2, 209715
 ; RV64-NEXT:    lui a3, 61681
@@ -491,6 +489,8 @@ define void @ctpop_v4i64(ptr %x, ptr %y) {
 ; RV64-NEXT:    add a3, a3, a5
 ; RV64-NEXT:    slli a5, a4, 32
 ; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
 ; RV64-NEXT:    vand.vx v10, v10, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index 5f275da1740cb..093ddc36bf7f9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -1263,91 +1263,59 @@ declare <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32)
 define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v15i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 48
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
@@ -1419,29 +1387,31 @@ define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v16, 1
-; RV32-NEXT:    vand.vv v0, v8, v0
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v16, v16, v0
-; RV32-NEXT:    vand.vv v0, v16, v24
-; RV32-NEXT:    vsrl.vi v16, v16, 2
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v0
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1495,91 +1465,59 @@ declare <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32)
 define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 48
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
@@ -1651,29 +1589,31 @@ define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v16, 1
-; RV32-NEXT:    vand.vv v0, v8, v0
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v16, v16, v0
-; RV32-NEXT:    vand.vv v0, v16, v24
-; RV32-NEXT:    vsrl.vi v16, v16, 2
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v0
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1730,18 +1670,17 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v7, v0, 2
+; RV32-NEXT:    vslidedown.vi v24, v0, 2
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
@@ -1753,12 +1692,12 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    sw a2, 36(sp)
 ; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 257
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:    addi a2, a2, 257
 ; RV32-NEXT:    sw a2, 16(sp)
 ; RV32-NEXT:    sw a2, 20(sp)
+; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    bltu a0, a1, .LBB34_2
 ; RV32-NEXT:  # %bb.1:
@@ -1771,95 +1710,116 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    li a5, 48
+; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
 ; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v16, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi a3, sp, 24
-; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v8, (a4), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -1867,84 +1827,88 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    vmv1r.v v0, v7
+; RV32-NEXT:    vmv1r.v v0, v24
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT:    vnot.v v16, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a0, sp, 48
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    addi a0, sp, 48
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
@@ -1965,9 +1929,9 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:    mv a4, a0
 ; RV64-NEXT:    bltu a0, a1, .LBB34_2
 ; RV64-NEXT:  # %bb.1:
@@ -2051,45 +2015,41 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_v32i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    sw a2, 32(sp)
-; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw a2, 20(sp)
 ; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    addi a1, a2, 257
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    addi a2, a2, 257
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a2, 0(sp)
+; RV32-NEXT:    sw a2, 4(sp)
+; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a3, .LBB35_2
+; RV32-NEXT:    bltu a0, a2, .LBB35_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB35_2:
 ; RV32-NEXT:    li a2, 1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vnot.v v0, v8
-; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    addi a3, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v8, v8, a2
 ; RV32-NEXT:    vand.vv v8, v0, v8
@@ -2100,59 +2060,50 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    vsub.vv v0, v8, v0
+; RV32-NEXT:    vsub.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 1
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vsrl.vi v0, v16, 1
+; RV32-NEXT:    vand.vv v24, v0, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vlse64.v v0, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v0, v8
-; RV32-NEXT:    vsrl.vi v0, v0, 2
-; RV32-NEXT:    vand.vv v0, v0, v8
-; RV32-NEXT:    vadd.vv v24, v24, v0
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v16, v8
+; RV32-NEXT:    vand.vv v24, v16, v0
 ; RV32-NEXT:    vsrl.vi v16, v16, 2
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vand.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v24, 4
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v0
+; RV32-NEXT:    addi a2, sp, 8
+; RV32-NEXT:    mv a3, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vadd.vv v16, v24, v16
-; RV32-NEXT:    addi a4, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a2), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v0, v8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a3), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v0
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v24, v8, v0
+; RV32-NEXT:    vmul.vv v16, v16, v24
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v16, a2
+; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v24, a2
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 48
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    vsrl.vx v16, v16, a2
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
@@ -3460,91 +3411,59 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v15i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 48
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
@@ -3616,29 +3535,31 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v16, 1
-; RV32-NEXT:    vand.vv v0, v8, v0
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v16, v16, v0
-; RV32-NEXT:    vand.vv v0, v16, v24
-; RV32-NEXT:    vsrl.vi v16, v16, 2
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v0
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3689,92 +3610,60 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 
 define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v16i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
+; RV32-NEXT:    sw a1, 16(sp)
+; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 24
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 48
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
@@ -3846,29 +3735,31 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v24, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v16, 1
-; RV32-NEXT:    vand.vv v0, v8, v0
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vv v16, v16, v0
-; RV32-NEXT:    vand.vv v0, v16, v24
-; RV32-NEXT:    vsrl.vi v16, v16, 2
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v0
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3923,18 +3814,17 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v7, v0, 2
+; RV32-NEXT:    vslidedown.vi v24, v0, 2
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
@@ -3946,12 +3836,12 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    sw a2, 36(sp)
 ; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 257
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:    addi a2, a2, 257
 ; RV32-NEXT:    sw a2, 16(sp)
 ; RV32-NEXT:    sw a2, 20(sp)
+; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    bltu a0, a1, .LBB70_2
 ; RV32-NEXT:  # %bb.1:
@@ -3964,95 +3854,116 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    li a5, 48
+; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
 ; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v16, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi a3, sp, 24
-; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v8, (a4), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -4060,84 +3971,88 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    vmv1r.v v0, v7
+; RV32-NEXT:    vmv1r.v v0, v24
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT:    vnot.v v16, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a0, sp, 48
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    addi a0, sp, 48
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
@@ -4158,9 +4073,9 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:    mv a4, a0
 ; RV64-NEXT:    bltu a0, a1, .LBB70_2
 ; RV64-NEXT:  # %bb.1:
@@ -4244,45 +4159,41 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-LABEL: vp_cttz_zero_undef_v32i64_unmasked:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT:    addi sp, sp, -32
+; RV32-NEXT:    .cfi_def_cfa_offset 32
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    sw a1, 44(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    sw a2, 32(sp)
-; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw a2, 20(sp)
 ; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    addi a1, a2, 257
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    addi a2, a2, 257
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a2, 0(sp)
+; RV32-NEXT:    sw a2, 4(sp)
+; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a3, .LBB71_2
+; RV32-NEXT:    bltu a0, a2, .LBB71_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB71_2:
 ; RV32-NEXT:    li a2, 1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vnot.v v0, v8
-; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    addi a3, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v8, v8, a2
 ; RV32-NEXT:    vand.vv v8, v0, v8
@@ -4293,59 +4204,50 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    vsub.vv v0, v8, v0
+; RV32-NEXT:    vsub.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 1
-; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vsrl.vi v0, v16, 1
+; RV32-NEXT:    vand.vv v24, v0, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vlse64.v v0, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v0, v8
-; RV32-NEXT:    vsrl.vi v0, v0, 2
-; RV32-NEXT:    vand.vv v0, v0, v8
-; RV32-NEXT:    vadd.vv v24, v24, v0
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v16, v8
+; RV32-NEXT:    vand.vv v24, v16, v0
 ; RV32-NEXT:    vsrl.vi v16, v16, 2
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vand.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v24, 4
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v0
+; RV32-NEXT:    addi a2, sp, 8
+; RV32-NEXT:    mv a3, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vadd.vv v16, v24, v16
-; RV32-NEXT:    addi a4, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a2), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v0, v8
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v0, (a3), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v0
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v24, v8, v0
+; RV32-NEXT:    vmul.vv v16, v16, v24
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v16, a2
+; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v24, a2
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 48
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    vsrl.vx v16, v16, a2
+; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
index 57e0eeb92ee2f..ddf92af2312cc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
@@ -45,9 +45,9 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    vnsrl.wi v10, v12, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RVF-NEXT:    vnsrl.wi v9, v10, 0
+; RVF-NEXT:    vsub.vx v9, v9, a1
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
-; RVF-NEXT:    vsub.vx v8, v9, a1
-; RVF-NEXT:    vmerge.vim v8, v8, 8, v0
+; RVF-NEXT:    vmerge.vim v8, v9, 8, v0
 ; RVF-NEXT:    vse8.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -64,9 +64,9 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    vnsrl.wi v10, v12, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RVD-NEXT:    vnsrl.wi v9, v10, 0
+; RVD-NEXT:    vsub.vx v9, v9, a1
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
-; RVD-NEXT:    vsub.vx v8, v9, a1
-; RVD-NEXT:    vmerge.vim v8, v8, 8, v0
+; RVD-NEXT:    vmerge.vim v8, v9, 8, v0
 ; RVD-NEXT:    vse8.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -390,10 +390,10 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVI-LABEL: cttz_v32i8:
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    li a1, 32
+; RVI-NEXT:    li a2, 1
 ; RVI-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; RVI-NEXT:    vle8.v v8, (a0)
-; RVI-NEXT:    li a1, 1
-; RVI-NEXT:    vsub.vx v10, v8, a1
+; RVI-NEXT:    vsub.vx v10, v8, a2
 ; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v10
@@ -425,9 +425,9 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    vnsrl.wi v12, v16, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; RVF-NEXT:    vnsrl.wi v10, v12, 0
+; RVF-NEXT:    vsub.vx v10, v10, a1
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
-; RVF-NEXT:    vsub.vx v8, v10, a1
-; RVF-NEXT:    vmerge.vim v8, v8, 8, v0
+; RVF-NEXT:    vmerge.vim v8, v10, 8, v0
 ; RVF-NEXT:    vse8.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -445,9 +445,9 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    vnsrl.wi v12, v16, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; RVD-NEXT:    vnsrl.wi v10, v12, 0
+; RVD-NEXT:    vsub.vx v10, v10, a1
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
-; RVD-NEXT:    vsub.vx v8, v10, a1
-; RVD-NEXT:    vmerge.vim v8, v8, 8, v0
+; RVD-NEXT:    vmerge.vim v8, v10, 8, v0
 ; RVD-NEXT:    vse8.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -1121,10 +1121,10 @@ define void @cttz_zero_undef_v32i8(ptr %x, ptr %y) nounwind {
 ; RVI-LABEL: cttz_zero_undef_v32i8:
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    li a1, 32
+; RVI-NEXT:    li a2, 1
 ; RVI-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; RVI-NEXT:    vle8.v v8, (a0)
-; RVI-NEXT:    li a1, 1
-; RVI-NEXT:    vsub.vx v10, v8, a1
+; RVI-NEXT:    vsub.vx v10, v8, a2
 ; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index b4634dbf5a5e8..b611fcd9ddb33 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -22,10 +22,10 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vadd.vi v12, v11, -16
+; CHECK-NEXT:    vadd.vi v11, v11, -15
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v8, 2
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v11, v11, -15
 ; CHECK-NEXT:    vmerge.vim v13, v10, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
index e13f4f4b50b0f..76e1ae0a69c24 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
@@ -26,26 +26,26 @@ define void @add_v4i32(ptr %x, ptr %y) {
 define void @add_v2i64(ptr %x, ptr %y) {
 ; RV32-LABEL: add_v2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a2, 0(a1)
-; RV32-NEXT:    lw a3, 4(a1)
-; RV32-NEXT:    lw a4, 0(a0)
-; RV32-NEXT:    lw a5, 4(a0)
-; RV32-NEXT:    lw a6, 8(a0)
-; RV32-NEXT:    lw a7, 12(a0)
-; RV32-NEXT:    lw t0, 12(a1)
-; RV32-NEXT:    lw a1, 8(a1)
-; RV32-NEXT:    add a3, a5, a3
-; RV32-NEXT:    add a2, a4, a2
-; RV32-NEXT:    add a7, a7, t0
-; RV32-NEXT:    add a1, a6, a1
-; RV32-NEXT:    sltu a4, a2, a4
-; RV32-NEXT:    sltu a5, a1, a6
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    add a5, a7, a5
-; RV32-NEXT:    sw a2, 0(a0)
-; RV32-NEXT:    sw a3, 4(a0)
-; RV32-NEXT:    sw a1, 8(a0)
-; RV32-NEXT:    sw a5, 12(a0)
+; RV32-NEXT:    lw a2, 0(a0)
+; RV32-NEXT:    lw a3, 4(a0)
+; RV32-NEXT:    lw a4, 8(a0)
+; RV32-NEXT:    lw a5, 12(a0)
+; RV32-NEXT:    lw a6, 0(a1)
+; RV32-NEXT:    lw a7, 4(a1)
+; RV32-NEXT:    lw t0, 8(a1)
+; RV32-NEXT:    lw a1, 12(a1)
+; RV32-NEXT:    add a3, a3, a7
+; RV32-NEXT:    add a6, a2, a6
+; RV32-NEXT:    add a1, a5, a1
+; RV32-NEXT:    add t0, a4, t0
+; RV32-NEXT:    sltu a2, a6, a2
+; RV32-NEXT:    sltu a4, t0, a4
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    add a1, a1, a4
+; RV32-NEXT:    sw a6, 0(a0)
+; RV32-NEXT:    sw a2, 4(a0)
+; RV32-NEXT:    sw t0, 8(a0)
+; RV32-NEXT:    sw a1, 12(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: add_v2i64:
@@ -89,14 +89,14 @@ define void @add_v1i64(ptr %x, ptr %y) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    lw a2, 0(a0)
 ; RV32-NEXT:    lw a3, 4(a0)
-; RV32-NEXT:    lw a4, 4(a1)
-; RV32-NEXT:    lw a1, 0(a1)
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    add a1, a2, a1
-; RV32-NEXT:    sltu a2, a1, a2
-; RV32-NEXT:    add a2, a3, a2
-; RV32-NEXT:    sw a1, 0(a0)
-; RV32-NEXT:    sw a2, 4(a0)
+; RV32-NEXT:    lw a4, 0(a1)
+; RV32-NEXT:    lw a1, 4(a1)
+; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    add a4, a2, a4
+; RV32-NEXT:    sltu a2, a4, a2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    sw a4, 0(a0)
+; RV32-NEXT:    sw a1, 4(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: add_v1i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll
index bb2b57fbcc3b7..54489765cff1a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll
@@ -308,9 +308,9 @@ define void @truncstore_v2i8_v2i1(<2 x i8> %x, ptr %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v8, v8, 1
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
index e53876d69b59b..b350268a3c10c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
@@ -10,9 +10,9 @@ define i1 @extractelt_v1i1(ptr %x, i64 %idx) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vmv.s.x v9, zero
 ; CHECK-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-NEXT:    vmv.s.x v8, zero
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a1
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
@@ -27,9 +27,9 @@ define i1 @extractelt_v2i1(ptr %x, i64 %idx) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a1
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
@@ -44,9 +44,9 @@ define i1 @extractelt_v4i1(ptr %x, i64 %idx) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a1
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
@@ -328,13 +328,13 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
 ; RV32-NEXT:    mv a2, sp
 ; RV32-NEXT:    li a3, 128
 ; RV32-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
-; RV32-NEXT:    vle8.v v8, (a0)
-; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vle8.v v16, (a0)
+; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    add a1, a2, a1
-; RV32-NEXT:    vmseq.vi v0, v8, 0
+; RV32-NEXT:    vle8.v v24, (a0)
+; RV32-NEXT:    vmseq.vi v8, v24, 0
 ; RV32-NEXT:    vmv.v.i v24, 0
-; RV32-NEXT:    vmseq.vi v8, v16, 0
+; RV32-NEXT:    vmseq.vi v0, v16, 0
 ; RV32-NEXT:    vmerge.vim v16, v24, 1, v0
 ; RV32-NEXT:    vse8.v v16, (a2)
 ; RV32-NEXT:    vmv1r.v v0, v8
@@ -359,13 +359,13 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
 ; RV64-NEXT:    mv a2, sp
 ; RV64-NEXT:    li a3, 128
 ; RV64-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
-; RV64-NEXT:    vle8.v v8, (a0)
-; RV64-NEXT:    addi a0, a0, 128
 ; RV64-NEXT:    vle8.v v16, (a0)
+; RV64-NEXT:    addi a0, a0, 128
 ; RV64-NEXT:    add a1, a2, a1
-; RV64-NEXT:    vmseq.vi v0, v8, 0
+; RV64-NEXT:    vle8.v v24, (a0)
+; RV64-NEXT:    vmseq.vi v8, v24, 0
 ; RV64-NEXT:    vmv.v.i v24, 0
-; RV64-NEXT:    vmseq.vi v8, v16, 0
+; RV64-NEXT:    vmseq.vi v0, v16, 0
 ; RV64-NEXT:    vmerge.vim v16, v24, 1, v0
 ; RV64-NEXT:    vse8.v v16, (a2)
 ; RV64-NEXT:    vmv1r.v v0, v8
@@ -390,13 +390,13 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
 ; RV32ZBS-NEXT:    mv a2, sp
 ; RV32ZBS-NEXT:    li a3, 128
 ; RV32ZBS-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
-; RV32ZBS-NEXT:    vle8.v v8, (a0)
-; RV32ZBS-NEXT:    addi a0, a0, 128
 ; RV32ZBS-NEXT:    vle8.v v16, (a0)
+; RV32ZBS-NEXT:    addi a0, a0, 128
 ; RV32ZBS-NEXT:    add a1, a2, a1
-; RV32ZBS-NEXT:    vmseq.vi v0, v8, 0
+; RV32ZBS-NEXT:    vle8.v v24, (a0)
+; RV32ZBS-NEXT:    vmseq.vi v8, v24, 0
 ; RV32ZBS-NEXT:    vmv.v.i v24, 0
-; RV32ZBS-NEXT:    vmseq.vi v8, v16, 0
+; RV32ZBS-NEXT:    vmseq.vi v0, v16, 0
 ; RV32ZBS-NEXT:    vmerge.vim v16, v24, 1, v0
 ; RV32ZBS-NEXT:    vse8.v v16, (a2)
 ; RV32ZBS-NEXT:    vmv1r.v v0, v8
@@ -421,13 +421,13 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
 ; RV64ZBS-NEXT:    mv a2, sp
 ; RV64ZBS-NEXT:    li a3, 128
 ; RV64ZBS-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
-; RV64ZBS-NEXT:    vle8.v v8, (a0)
-; RV64ZBS-NEXT:    addi a0, a0, 128
 ; RV64ZBS-NEXT:    vle8.v v16, (a0)
+; RV64ZBS-NEXT:    addi a0, a0, 128
 ; RV64ZBS-NEXT:    add a1, a2, a1
-; RV64ZBS-NEXT:    vmseq.vi v0, v8, 0
+; RV64ZBS-NEXT:    vle8.v v24, (a0)
+; RV64ZBS-NEXT:    vmseq.vi v8, v24, 0
 ; RV64ZBS-NEXT:    vmv.v.i v24, 0
-; RV64ZBS-NEXT:    vmseq.vi v8, v16, 0
+; RV64ZBS-NEXT:    vmseq.vi v0, v16, 0
 ; RV64ZBS-NEXT:    vmerge.vim v16, v24, 1, v0
 ; RV64ZBS-NEXT:    vse8.v v16, (a2)
 ; RV64ZBS-NEXT:    vmv1r.v v0, v8
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
index e9dca2c42e835..c7370102be738 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
@@ -560,12 +560,13 @@ define void @extract_v2i1_v64i1_2(ptr %x, ptr %y) {
 ; VLA-NEXT:    vlm.v v0, (a0)
 ; VLA-NEXT:    vmv.v.i v8, 0
 ; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT:    vmv.v.i v9, 0
 ; VLA-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
 ; VLA-NEXT:    vslidedown.vi v8, v8, 2
 ; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; VLA-NEXT:    vmsne.vi v0, v8, 0
-; VLA-NEXT:    vmv.v.i v8, 0
-; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vmerge.vim v8, v9, 1, v0
 ; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; VLA-NEXT:    vmv.v.i v9, 0
 ; VLA-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
@@ -581,12 +582,13 @@ define void @extract_v2i1_v64i1_2(ptr %x, ptr %y) {
 ; VLS-NEXT:    vlm.v v0, (a0)
 ; VLS-NEXT:    vmv.v.i v8, 0
 ; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLS-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT:    vmv.v.i v9, 0
 ; VLS-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
 ; VLS-NEXT:    vslidedown.vi v8, v8, 2
 ; VLS-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; VLS-NEXT:    vmsne.vi v0, v8, 0
-; VLS-NEXT:    vmv.v.i v8, 0
-; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLS-NEXT:    vmerge.vim v8, v9, 1, v0
 ; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; VLS-NEXT:    vmv.v.i v9, 0
 ; VLS-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
@@ -610,12 +612,13 @@ define void @extract_v2i1_v64i1_42(ptr %x, ptr %y) {
 ; VLA-NEXT:    li a0, 42
 ; VLA-NEXT:    vmv.v.i v8, 0
 ; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT:    vmv.v.i v12, 0
 ; VLA-NEXT:    vsetivli zero, 2, e8, m4, ta, ma
 ; VLA-NEXT:    vslidedown.vx v8, v8, a0
 ; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; VLA-NEXT:    vmsne.vi v0, v8, 0
-; VLA-NEXT:    vmv.v.i v8, 0
-; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vmerge.vim v8, v12, 1, v0
 ; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; VLA-NEXT:    vmv.v.i v9, 0
 ; VLA-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
@@ -631,11 +634,12 @@ define void @extract_v2i1_v64i1_42(ptr %x, ptr %y) {
 ; VLS-NEXT:    vlm.v v0, (a0)
 ; VLS-NEXT:    vmv.v.i v8, 0
 ; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
-; VLS-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; VLS-NEXT:    vslidedown.vi v8, v10, 10
 ; VLS-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; VLS-NEXT:    vmsne.vi v0, v8, 0
 ; VLS-NEXT:    vmv.v.i v8, 0
+; VLS-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; VLS-NEXT:    vslidedown.vi v9, v10, 10
+; VLS-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT:    vmsne.vi v0, v9, 0
 ; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
 ; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; VLS-NEXT:    vmv.v.i v9, 0
@@ -676,12 +680,13 @@ define void @extract_v2i1_nxv2i1_2(<vscale x 2 x i1> %x, ptr %y) {
 ; VLA-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; VLA-NEXT:    vmv.v.i v8, 0
 ; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT:    vmv.v.i v9, 0
 ; VLA-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; VLA-NEXT:    vslidedown.vi v8, v8, 2
 ; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; VLA-NEXT:    vmsne.vi v0, v8, 0
-; VLA-NEXT:    vmv.v.i v8, 0
-; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vmerge.vim v8, v9, 1, v0
 ; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; VLA-NEXT:    vmv.v.i v9, 0
 ; VLA-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
@@ -696,12 +701,13 @@ define void @extract_v2i1_nxv2i1_2(<vscale x 2 x i1> %x, ptr %y) {
 ; VLS-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; VLS-NEXT:    vmv.v.i v8, 0
 ; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLS-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT:    vmv.v.i v9, 0
 ; VLS-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; VLS-NEXT:    vslidedown.vi v8, v8, 2
 ; VLS-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; VLS-NEXT:    vmsne.vi v0, v8, 0
-; VLS-NEXT:    vmv.v.i v8, 0
-; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLS-NEXT:    vmerge.vim v8, v9, 1, v0
 ; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; VLS-NEXT:    vmv.v.i v9, 0
 ; VLS-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
@@ -740,12 +746,13 @@ define void @extract_v2i1_nxv64i1_2(<vscale x 64 x i1> %x, ptr %y) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
@@ -766,12 +773,13 @@ define void @extract_v2i1_nxv64i1_42(<vscale x 64 x i1> %x, ptr %y) {
 ; VLA-NEXT:    vmv.v.i v8, 0
 ; VLA-NEXT:    li a1, 42
 ; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT:    vmv.v.i v12, 0
 ; VLA-NEXT:    vsetivli zero, 2, e8, m4, ta, ma
 ; VLA-NEXT:    vslidedown.vx v8, v8, a1
 ; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; VLA-NEXT:    vmsne.vi v0, v8, 0
-; VLA-NEXT:    vmv.v.i v8, 0
-; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vmerge.vim v8, v12, 1, v0
 ; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; VLA-NEXT:    vmv.v.i v9, 0
 ; VLA-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
@@ -786,11 +794,12 @@ define void @extract_v2i1_nxv64i1_42(<vscale x 64 x i1> %x, ptr %y) {
 ; VLS-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
 ; VLS-NEXT:    vmv.v.i v8, 0
 ; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
-; VLS-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; VLS-NEXT:    vslidedown.vi v8, v10, 10
 ; VLS-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; VLS-NEXT:    vmsne.vi v0, v8, 0
 ; VLS-NEXT:    vmv.v.i v8, 0
+; VLS-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; VLS-NEXT:    vslidedown.vi v9, v10, 10
+; VLS-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT:    vmsne.vi v0, v9, 0
 ; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
 ; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; VLS-NEXT:    vmv.v.i v9, 0
@@ -811,12 +820,13 @@ define void @extract_v2i1_nxv32i1_26(<vscale x 32 x i1> %x, ptr %y) {
 ; VLA-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; VLA-NEXT:    vmv.v.i v8, 0
 ; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT:    vmv.v.i v10, 0
 ; VLA-NEXT:    vsetivli zero, 2, e8, m2, ta, ma
 ; VLA-NEXT:    vslidedown.vi v8, v8, 26
 ; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; VLA-NEXT:    vmsne.vi v0, v8, 0
-; VLA-NEXT:    vmv.v.i v8, 0
-; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vmerge.vim v8, v10, 1, v0
 ; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; VLA-NEXT:    vmv.v.i v9, 0
 ; VLA-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
@@ -831,11 +841,12 @@ define void @extract_v2i1_nxv32i1_26(<vscale x 32 x i1> %x, ptr %y) {
 ; VLS-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; VLS-NEXT:    vmv.v.i v8, 0
 ; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
-; VLS-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; VLS-NEXT:    vslidedown.vi v8, v9, 10
 ; VLS-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; VLS-NEXT:    vmsne.vi v0, v8, 0
 ; VLS-NEXT:    vmv.v.i v8, 0
+; VLS-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; VLS-NEXT:    vslidedown.vi v9, v9, 10
+; VLS-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT:    vmsne.vi v0, v9, 0
 ; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
 ; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; VLS-NEXT:    vmv.v.i v9, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
index 7e45136372b6c..f613449856e09 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
@@ -626,11 +626,11 @@ define i32 @extractelt_v32i32_idx(ptr %x, i32 zeroext %idx) nounwind {
 ; RV32NOM-NEXT:    andi a0, a1, 31
 ; RV32NOM-NEXT:    li a1, 4
 ; RV32NOM-NEXT:    call __mulsi3
-; RV32NOM-NEXT:    li a1, 32
-; RV32NOM-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32NOM-NEXT:    vle32.v v8, (s2)
 ; RV32NOM-NEXT:    mv a1, sp
+; RV32NOM-NEXT:    li a2, 32
 ; RV32NOM-NEXT:    add a0, a1, a0
+; RV32NOM-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV32NOM-NEXT:    vle32.v v8, (s2)
 ; RV32NOM-NEXT:    vadd.vv v8, v8, v8
 ; RV32NOM-NEXT:    vse32.v v8, (a1)
 ; RV32NOM-NEXT:    lw a0, 0(a0)
@@ -649,14 +649,14 @@ define i32 @extractelt_v32i32_idx(ptr %x, i32 zeroext %idx) nounwind {
 ; RV32M-NEXT:    addi s0, sp, 256
 ; RV32M-NEXT:    andi sp, sp, -128
 ; RV32M-NEXT:    andi a1, a1, 31
-; RV32M-NEXT:    li a2, 32
-; RV32M-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32M-NEXT:    vle32.v v8, (a0)
+; RV32M-NEXT:    mv a2, sp
+; RV32M-NEXT:    li a3, 32
 ; RV32M-NEXT:    slli a1, a1, 2
-; RV32M-NEXT:    mv a0, sp
-; RV32M-NEXT:    or a1, a0, a1
+; RV32M-NEXT:    or a1, a2, a1
+; RV32M-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; RV32M-NEXT:    vle32.v v8, (a0)
 ; RV32M-NEXT:    vadd.vv v8, v8, v8
-; RV32M-NEXT:    vse32.v v8, (a0)
+; RV32M-NEXT:    vse32.v v8, (a2)
 ; RV32M-NEXT:    lw a0, 0(a1)
 ; RV32M-NEXT:    addi sp, s0, -256
 ; RV32M-NEXT:    lw ra, 252(sp) # 4-byte Folded Reload
@@ -676,11 +676,11 @@ define i32 @extractelt_v32i32_idx(ptr %x, i32 zeroext %idx) nounwind {
 ; RV64NOM-NEXT:    andi a0, a1, 31
 ; RV64NOM-NEXT:    li a1, 4
 ; RV64NOM-NEXT:    call __muldi3
-; RV64NOM-NEXT:    li a1, 32
-; RV64NOM-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV64NOM-NEXT:    vle32.v v8, (s2)
 ; RV64NOM-NEXT:    mv a1, sp
+; RV64NOM-NEXT:    li a2, 32
 ; RV64NOM-NEXT:    add a0, a1, a0
+; RV64NOM-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV64NOM-NEXT:    vle32.v v8, (s2)
 ; RV64NOM-NEXT:    vadd.vv v8, v8, v8
 ; RV64NOM-NEXT:    vse32.v v8, (a1)
 ; RV64NOM-NEXT:    lw a0, 0(a0)
@@ -699,14 +699,14 @@ define i32 @extractelt_v32i32_idx(ptr %x, i32 zeroext %idx) nounwind {
 ; RV64M-NEXT:    addi s0, sp, 256
 ; RV64M-NEXT:    andi sp, sp, -128
 ; RV64M-NEXT:    andi a1, a1, 31
-; RV64M-NEXT:    li a2, 32
-; RV64M-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV64M-NEXT:    vle32.v v8, (a0)
+; RV64M-NEXT:    mv a2, sp
+; RV64M-NEXT:    li a3, 32
 ; RV64M-NEXT:    slli a1, a1, 2
-; RV64M-NEXT:    mv a0, sp
-; RV64M-NEXT:    or a1, a0, a1
+; RV64M-NEXT:    or a1, a2, a1
+; RV64M-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; RV64M-NEXT:    vle32.v v8, (a0)
 ; RV64M-NEXT:    vadd.vv v8, v8, v8
-; RV64M-NEXT:    vse32.v v8, (a0)
+; RV64M-NEXT:    vse32.v v8, (a2)
 ; RV64M-NEXT:    lw a0, 0(a1)
 ; RV64M-NEXT:    addi sp, s0, -256
 ; RV64M-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll
index ab2d00b9b9137..c328d5fbe6b0a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll
@@ -10,11 +10,11 @@ define <1 x half> @ceil_v1f16(<1 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -33,11 +33,11 @@ define <2 x half> @ceil_v2f16(<2 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -56,11 +56,11 @@ define <4 x half> @ceil_v4f16(<4 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -79,11 +79,11 @@ define <8 x half> @ceil_v8f16(<8 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -102,11 +102,11 @@ define <16 x half> @ceil_v16f16(<16 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -128,9 +128,9 @@ define <32 x half> @ceil_v32f16(<32 x half> %x) strictfp {
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -151,9 +151,9 @@ define <1 x float> @ceil_v1f32(<1 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -174,9 +174,9 @@ define <2 x float> @ceil_v2f32(<2 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -197,9 +197,9 @@ define <4 x float> @ceil_v4f32(<4 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -220,9 +220,9 @@ define <8 x float> @ceil_v8f32(<8 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -243,9 +243,9 @@ define <16 x float> @ceil_v16f32(<16 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -264,11 +264,11 @@ define <1 x double> @ceil_v1f64(<1 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -287,11 +287,11 @@ define <2 x double> @ceil_v2f64(<2 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -310,11 +310,11 @@ define <4 x double> @ceil_v4f64(<4 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -333,11 +333,11 @@ define <8 x double> @ceil_v8f64(<8 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll
index c6ce7c1bbe8b4..ebb75357cdfe7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll
@@ -10,11 +10,11 @@ define <1 x half> @floor_v1f16(<1 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -33,11 +33,11 @@ define <2 x half> @floor_v2f16(<2 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -56,11 +56,11 @@ define <4 x half> @floor_v4f16(<4 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -79,11 +79,11 @@ define <8 x half> @floor_v8f16(<8 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -102,11 +102,11 @@ define <16 x half> @floor_v16f16(<16 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -128,9 +128,9 @@ define <32 x half> @floor_v32f16(<32 x half> %x) strictfp {
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -151,9 +151,9 @@ define <1 x float> @floor_v1f32(<1 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -174,9 +174,9 @@ define <2 x float> @floor_v2f32(<2 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -197,9 +197,9 @@ define <4 x float> @floor_v4f32(<4 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -220,9 +220,9 @@ define <8 x float> @floor_v8f32(<8 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -243,9 +243,9 @@ define <16 x float> @floor_v16f32(<16 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -264,11 +264,11 @@ define <1 x double> @floor_v1f64(<1 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -287,11 +287,11 @@ define <2 x double> @floor_v2f64(<2 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -310,11 +310,11 @@ define <4 x double> @floor_v4f64(<4 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -333,11 +333,11 @@ define <8 x double> @floor_v8f64(<8 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
index d500469003aea..6536021da0313 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
@@ -13,13 +13,13 @@ declare <2 x half> @llvm.vp.floor.v2f16(<2 x half>, <2 x i1>, i32)
 define <2 x half> @vp_floor_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_v2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI0_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI0_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI0_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -35,12 +35,12 @@ define <2 x half> @vp_floor_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -59,12 +59,12 @@ define <2 x half> @vp_floor_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 define <2 x half> @vp_floor_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_v2f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI1_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI1_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI1_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -77,11 +77,11 @@ define <2 x half> @vp_floor_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -99,13 +99,13 @@ declare <4 x half> @llvm.vp.floor.v4f16(<4 x half>, <4 x i1>, i32)
 define <4 x half> @vp_floor_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_v4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI2_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI2_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI2_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -121,12 +121,12 @@ define <4 x half> @vp_floor_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -145,12 +145,12 @@ define <4 x half> @vp_floor_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 define <4 x half> @vp_floor_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_v4f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI3_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI3_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI3_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -163,11 +163,11 @@ define <4 x half> @vp_floor_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -185,13 +185,13 @@ declare <8 x half> @llvm.vp.floor.v8f16(<8 x half>, <8 x i1>, i32)
 define <8 x half> @vp_floor_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_v8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI4_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI4_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI4_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -207,12 +207,12 @@ define <8 x half> @vp_floor_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
@@ -231,12 +231,12 @@ define <8 x half> @vp_floor_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 define <8 x half> @vp_floor_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_v8f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI5_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI5_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -249,11 +249,11 @@ define <8 x half> @vp_floor_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -273,12 +273,12 @@ define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v10, v0
+; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a0)
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -295,12 +295,12 @@ define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
@@ -319,12 +319,12 @@ define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %
 define <16 x half> @vp_floor_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_v16f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI7_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -337,11 +337,11 @@ define <16 x half> @vp_floor_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -363,9 +363,9 @@ define <2 x float> @vp_floor_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -384,8 +384,8 @@ define <2 x float> @vp_floor_v2f32_unmasked(<2 x float> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -405,9 +405,9 @@ define <4 x float> @vp_floor_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -426,8 +426,8 @@ define <4 x float> @vp_floor_v4f32_unmasked(<4 x float> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -448,9 +448,9 @@ define <8 x float> @vp_floor_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -470,8 +470,8 @@ define <8 x float> @vp_floor_v8f32_unmasked(<8 x float> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -492,9 +492,9 @@ define <16 x float> @vp_floor_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -514,8 +514,8 @@ define <16 x float> @vp_floor_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -531,13 +531,13 @@ declare <2 x double> @llvm.vp.floor.v2f64(<2 x double>, <2 x i1>, i32)
 define <2 x double> @vp_floor_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI16_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI16_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
+; CHECK-NEXT:    lui a0, %hi(.LCPI16_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI16_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -552,12 +552,12 @@ define <2 x double> @vp_floor_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %
 define <2 x double> @vp_floor_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v2f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI17_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -575,12 +575,12 @@ define <4 x double> @vp_floor_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -596,12 +596,12 @@ define <4 x double> @vp_floor_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %
 define <4 x double> @vp_floor_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v4f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI19_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -619,12 +619,12 @@ define <8 x double> @vp_floor_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a0)
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -640,12 +640,12 @@ define <8 x double> @vp_floor_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %
 define <8 x double> @vp_floor_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v8f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI21_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI21_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI21_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI21_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -663,12 +663,12 @@ define <15 x double> @vp_floor_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -684,12 +684,12 @@ define <15 x double> @vp_floor_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroe
 define <15 x double> @vp_floor_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v15f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI23_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI23_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI23_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI23_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -707,12 +707,12 @@ define <16 x double> @vp_floor_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -728,12 +728,12 @@ define <16 x double> @vp_floor_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroe
 define <16 x double> @vp_floor_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v16f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI25_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI25_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI25_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI25_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -762,8 +762,8 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
@@ -778,33 +778,33 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
index 4f11e6c3c386a..dc5e2e213f781 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
@@ -59,16 +59,14 @@ define <2 x half> @vfmax_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
 ; ZVFHMIN-LABEL: vfmax_vv_v2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
-; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -127,16 +125,14 @@ define <4 x half> @vfmax_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
 ; ZVFHMIN-LABEL: vfmax_vv_v4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
-; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -197,15 +193,13 @@ define <8 x half> @vfmax_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
 ; ZVFHMIN-LABEL: vfmax_vv_v8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v10, v12, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -269,15 +263,13 @@ define <16 x half> @vfmax_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
 ; ZVFHMIN-LABEL: vfmax_vv_v16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v12, v12, v16, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -587,7 +579,7 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
@@ -601,29 +593,29 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    li a3, 16
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    mv a0, a2
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
-; CHECK-NEXT:    bltu a2, a1, .LBB24_2
+; CHECK-NEXT:    bltu a2, a3, .LBB24_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB24_2:
 ; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    mul a0, a0, a3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v26
 ; CHECK-NEXT:    vmv8r.v v8, v16
@@ -680,10 +672,10 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v16, v24, v8, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
index e17ad303eddb8..eeb9ba155764c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
@@ -24,16 +24,14 @@ define <2 x half> @vfmax_v2f16_vv(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN-LABEL: vfmax_v2f16_vv:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
-; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -57,16 +55,14 @@ define <4 x half> @vfmax_v4f16_vv(<4 x half> %a, <4 x half> %b) {
 ; ZVFHMIN-LABEL: vfmax_v4f16_vv:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
-; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -90,15 +86,13 @@ define <8 x half> @vfmax_v8f16_vv(<8 x half> %a, <8 x half> %b) {
 ; ZVFHMIN-LABEL: vfmax_v8f16_vv:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v10, v12, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -123,15 +117,13 @@ define <16 x half> @vfmax_v16f16_vv(<16 x half> %a, <16 x half> %b) {
 ; ZVFHMIN-LABEL: vfmax_v16f16_vv:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v12, v12, v16, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -295,8 +287,8 @@ define <2 x half> @vfmax_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
 ; ZVFH-LABEL: vfmax_v2f16_vv_nnana:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vfadd.vv v8, v8, v8
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v10, v9, v8, v0
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFH-NEXT:    vmerge.vvm v8, v8, v9, v0
@@ -332,8 +324,8 @@ define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
 ; ZVFH-LABEL: vfmax_v2f16_vv_nnanb:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFH-NEXT:    vfadd.vv v9, v9, v9
+; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFH-NEXT:    vmerge.vvm v10, v8, v9, v0
 ; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
index 2e2103ad5e06d..546aa751c9c73 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
@@ -59,16 +59,14 @@ define <2 x half> @vfmin_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
 ; ZVFHMIN-LABEL: vfmin_vv_v2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
-; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -127,16 +125,14 @@ define <4 x half> @vfmin_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
 ; ZVFHMIN-LABEL: vfmin_vv_v4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
-; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -197,15 +193,13 @@ define <8 x half> @vfmin_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
 ; ZVFHMIN-LABEL: vfmin_vv_v8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v10, v12, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -269,15 +263,13 @@ define <16 x half> @vfmin_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
 ; ZVFHMIN-LABEL: vfmin_vv_v16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v12, v12, v16, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -587,7 +579,7 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
@@ -601,29 +593,29 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    li a3, 16
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    mv a0, a2
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
-; CHECK-NEXT:    bltu a2, a1, .LBB24_2
+; CHECK-NEXT:    bltu a2, a3, .LBB24_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB24_2:
 ; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    mul a0, a0, a3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v26
 ; CHECK-NEXT:    vmv8r.v v8, v16
@@ -680,10 +672,10 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v16, v24, v8, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
index 1362055c4dabf..196915bf141d4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
@@ -24,16 +24,14 @@ define <2 x half> @vfmin_v2f16_vv(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN-LABEL: vfmin_v2f16_vv:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
-; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -57,16 +55,14 @@ define <4 x half> @vfmin_v4f16_vv(<4 x half> %a, <4 x half> %b) {
 ; ZVFHMIN-LABEL: vfmin_v4f16_vv:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
-; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -90,15 +86,13 @@ define <8 x half> @vfmin_v8f16_vv(<8 x half> %a, <8 x half> %b) {
 ; ZVFHMIN-LABEL: vfmin_v8f16_vv:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v10, v12, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -123,15 +117,13 @@ define <16 x half> @vfmin_v16f16_vv(<16 x half> %a, <16 x half> %b) {
 ; ZVFHMIN-LABEL: vfmin_v16f16_vv:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v12, v12, v16, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -295,8 +287,8 @@ define <2 x half> @vfmin_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
 ; ZVFH-LABEL: vfmin_v2f16_vv_nnana:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vfadd.vv v8, v8, v8
+; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v10, v9, v8, v0
 ; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFH-NEXT:    vmerge.vvm v8, v8, v9, v0
@@ -332,8 +324,8 @@ define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
 ; ZVFH-LABEL: vfmin_v2f16_vv_nnanb:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFH-NEXT:    vfadd.vv v9, v9, v9
+; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFH-NEXT:    vmerge.vvm v10, v8, v9, v0
 ; ZVFH-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFH-NEXT:    vmerge.vvm v8, v9, v8, v0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll
index 3a7ded1537ef6..f192a053ac888 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll
@@ -12,17 +12,17 @@ define <2 x half> @nearbyint_v2f16(<2 x half> %v) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <2 x half> @llvm.experimental.constrained.nearbyint.v2f16(<2 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <2 x half> %r
@@ -36,17 +36,17 @@ define <4 x half> @nearbyint_v4f16(<4 x half> %v) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <4 x half> @llvm.experimental.constrained.nearbyint.v4f16(<4 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <4 x half> %r
@@ -60,17 +60,17 @@ define <8 x half> @nearbyint_v8f16(<8 x half> %v) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <8 x half> @llvm.experimental.constrained.nearbyint.v8f16(<8 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <8 x half> %r
@@ -84,17 +84,17 @@ define <16 x half> @nearbyint_v16f16(<16 x half> %v) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <16 x half> @llvm.experimental.constrained.nearbyint.v16f16(<16 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <16 x half> %r
@@ -111,15 +111,15 @@ define <32 x half> @nearbyint_v32f16(<32 x half> %v) strictfp {
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <32 x half> @llvm.experimental.constrained.nearbyint.v32f16(<32 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <32 x half> %r
@@ -135,15 +135,15 @@ define <2 x float> @nearbyint_v2f32(<2 x float> %v) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <2 x float> @llvm.experimental.constrained.nearbyint.v2f32(<2 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <2 x float> %r
@@ -159,15 +159,15 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %v) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <4 x float> %r
@@ -183,15 +183,15 @@ define <8 x float> @nearbyint_v8f32(<8 x float> %v) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <8 x float> @llvm.experimental.constrained.nearbyint.v8f32(<8 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <8 x float> %r
@@ -207,15 +207,15 @@ define <16 x float> @nearbyint_v16f32(<16 x float> %v) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <16 x float> @llvm.experimental.constrained.nearbyint.v16f32(<16 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <16 x float> %r
@@ -229,17 +229,17 @@ define <2 x double> @nearbyint_v2f64(<2 x double> %v) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI9_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI9_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <2 x double> %r
@@ -253,17 +253,17 @@ define <4 x double> @nearbyint_v4f64(<4 x double> %v) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI10_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI10_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <4 x double> %r
@@ -277,17 +277,17 @@ define <8 x double> @nearbyint_v8f64(<8 x double> %v) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <8 x double> @llvm.experimental.constrained.nearbyint.v8f64(<8 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <8 x double> %r
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index e82891f90d85e..4c0186e7d219c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -46,9 +46,11 @@ define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x,
 ; CHECK-NEXT:    vmadd.vx v14, a0, v12
 ; CHECK-NEXT:    li a0, 129
 ; CHECK-NEXT:    vmv.s.x v15, a0
-; CHECK-NEXT:    vmv.v.i v0, 12
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vcompress.vm v12, v8, v15
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 12
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vrgatherei16.vv v12, v10, v14, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v12
 ; CHECK-NEXT:    ret
@@ -1749,13 +1751,13 @@ define <8 x float> @buildvec_v8f32_zvl256(float %e0, float %e1, float %e2, float
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v8, fa0
 ; CHECK-NEXT:    vfmv.v.f v9, fa4
-; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa1
 ; CHECK-NEXT:    vfslide1down.vf v9, v9, fa5
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa2
 ; CHECK-NEXT:    vfslide1down.vf v9, v9, fa6
 ; CHECK-NEXT:    vfslide1down.vf v10, v8, fa3
 ; CHECK-NEXT:    vfslide1down.vf v8, v9, fa7
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; CHECK-NEXT:    ret
   %v0 = insertelement <8 x float> poison, float %e0, i64 0
@@ -1800,13 +1802,13 @@ define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, d
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v8, fa0
 ; CHECK-NEXT:    vfmv.v.f v9, fa4
-; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa1
 ; CHECK-NEXT:    vfslide1down.vf v9, v9, fa5
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa2
 ; CHECK-NEXT:    vfslide1down.vf v9, v9, fa6
 ; CHECK-NEXT:    vfslide1down.vf v10, v8, fa3
 ; CHECK-NEXT:    vfslide1down.vf v8, v9, fa7
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; CHECK-NEXT:    ret
   %v0 = insertelement <8 x double> poison, double %e0, i64 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
index 4b09b571b9406..94c471c5a3638 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -54,9 +54,9 @@ define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) {
 ; RV32-V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
 ; RV32-V512-NEXT:    vid.v v10
 ; RV32-V512-NEXT:    vsrl.vi v11, v10, 1
-; RV32-V512-NEXT:    vmv.v.i v0, 10
 ; RV32-V512-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; RV32-V512-NEXT:    vrgatherei16.vv v10, v8, v11
+; RV32-V512-NEXT:    vmv.v.i v0, 10
 ; RV32-V512-NEXT:    vrgatherei16.vv v10, v9, v11, v0.t
 ; RV32-V512-NEXT:    vmv.v.v v8, v10
 ; RV32-V512-NEXT:    ret
@@ -66,8 +66,8 @@ define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) {
 ; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
 ; RV64-V512-NEXT:    vid.v v10
 ; RV64-V512-NEXT:    vsrl.vi v11, v10, 1
-; RV64-V512-NEXT:    vmv.v.i v0, 10
 ; RV64-V512-NEXT:    vrgather.vv v10, v8, v11
+; RV64-V512-NEXT:    vmv.v.i v0, 10
 ; RV64-V512-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; RV64-V512-NEXT:    vmv.v.v v8, v10
 ; RV64-V512-NEXT:    ret
@@ -253,8 +253,8 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
 ; V128-NEXT:    vzext.vf2 v8, v24
 ; V128-NEXT:    addi a1, a1, -1366
 ; V128-NEXT:    vzext.vf2 v24, v0
-; V128-NEXT:    vmv.s.x v0, a1
 ; V128-NEXT:    vsll.vx v8, v8, a0
+; V128-NEXT:    vmv.s.x v0, a1
 ; V128-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; V128-NEXT:    vmerge.vvm v24, v24, v8, v0
 ; V128-NEXT:    addi a0, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll
index c14eae0b1de61..92374177d93e3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll
@@ -17,10 +17,10 @@ define void @fcmp_oeq_vv_v8f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_oeq_vv_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v10
 ; ZVFHMIN-NEXT:    vsm.v v8, (a2)
@@ -45,10 +45,10 @@ define void @fcmp_oeq_vv_v8f16_nonans(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_oeq_vv_v8f16_nonans:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v10
 ; ZVFHMIN-NEXT:    vsm.v v8, (a2)
@@ -173,10 +173,10 @@ define void @fcmp_olt_vv_v16f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_olt_vv_v16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v10, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12
 ; ZVFHMIN-NEXT:    vsm.v v8, (a2)
@@ -201,10 +201,10 @@ define void @fcmp_olt_vv_v16f16_nonans(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_olt_vv_v16f16_nonans:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v10, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12
 ; ZVFHMIN-NEXT:    vsm.v v8, (a2)
@@ -345,10 +345,10 @@ define void @fcmp_ule_vv_v32f16_nonans(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    li a3, 32
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v12, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v12, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v24, v16
 ; ZVFHMIN-NEXT:    vsm.v v8, (a2)
@@ -535,11 +535,11 @@ define void @fcmp_ord_vv_v4f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFH-LABEL: fcmp_ord_vv_v4f16:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFH-NEXT:    vle16.v v8, (a1)
-; ZVFH-NEXT:    vle16.v v9, (a0)
-; ZVFH-NEXT:    vmfeq.vv v8, v8, v8
+; ZVFH-NEXT:    vle16.v v8, (a0)
+; ZVFH-NEXT:    vle16.v v9, (a1)
 ; ZVFH-NEXT:    vmfeq.vv v9, v9, v9
-; ZVFH-NEXT:    vmand.mm v0, v9, v8
+; ZVFH-NEXT:    vmfeq.vv v8, v8, v8
+; ZVFH-NEXT:    vmand.mm v0, v8, v9
 ; ZVFH-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFH-NEXT:    vmv.v.i v8, 0
 ; ZVFH-NEXT:    vmerge.vim v8, v8, 1, v0
@@ -555,14 +555,14 @@ define void @fcmp_ord_vv_v4f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_ord_vv_v4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v8, v8
-; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
+; ZVFHMIN-NEXT:    vmfeq.vv v9, v9, v9
+; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.i v8, 0
 ; ZVFHMIN-NEXT:    vmerge.vim v8, v8, 1, v0
@@ -585,11 +585,11 @@ define void @fcmp_uno_vv_v4f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFH-LABEL: fcmp_uno_vv_v4f16:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFH-NEXT:    vle16.v v8, (a1)
-; ZVFH-NEXT:    vle16.v v9, (a0)
-; ZVFH-NEXT:    vmfne.vv v8, v8, v8
+; ZVFH-NEXT:    vle16.v v8, (a0)
+; ZVFH-NEXT:    vle16.v v9, (a1)
 ; ZVFH-NEXT:    vmfne.vv v9, v9, v9
-; ZVFH-NEXT:    vmor.mm v0, v9, v8
+; ZVFH-NEXT:    vmfne.vv v8, v8, v8
+; ZVFH-NEXT:    vmor.mm v0, v8, v9
 ; ZVFH-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; ZVFH-NEXT:    vmv.v.i v8, 0
 ; ZVFH-NEXT:    vmerge.vim v8, v8, 1, v0
@@ -605,14 +605,14 @@ define void @fcmp_uno_vv_v4f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fcmp_uno_vv_v4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10
-; ZVFHMIN-NEXT:    vmfne.vv v8, v8, v8
-; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10
+; ZVFHMIN-NEXT:    vmfne.vv v9, v9, v9
+; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.i v8, 0
 ; ZVFHMIN-NEXT:    vmerge.vim v8, v8, 1, v0
@@ -692,12 +692,13 @@ define void @fcmp_oeq_vf_v8f16_nonans(ptr %x, half %y, ptr %z) {
 define void @fcmp_une_vf_v4f32(ptr %x, float %y, ptr %z) {
 ; CHECK-LABEL: fcmp_une_vf_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vmfne.vf v0, v8, fa0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
@@ -717,12 +718,13 @@ define void @fcmp_une_vf_v4f32(ptr %x, float %y, ptr %z) {
 define void @fcmp_une_vf_v4f32_nonans(ptr %x, float %y, ptr %z) {
 ; CHECK-LABEL: fcmp_une_vf_v4f32_nonans:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vmfne.vf v0, v8, fa0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
@@ -742,12 +744,13 @@ define void @fcmp_une_vf_v4f32_nonans(ptr %x, float %y, ptr %z) {
 define void @fcmp_ogt_vf_v2f64(ptr %x, double %y, ptr %z) {
 ; CHECK-LABEL: fcmp_ogt_vf_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
@@ -767,12 +770,13 @@ define void @fcmp_ogt_vf_v2f64(ptr %x, double %y, ptr %z) {
 define void @fcmp_ogt_vf_v2f64_nonans(ptr %x, double %y, ptr %z) {
 ; CHECK-LABEL: fcmp_ogt_vf_v2f64_nonans:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
@@ -1333,12 +1337,13 @@ define void @fcmp_oeq_fv_v8f16_nonans(ptr %x, half %y, ptr %z) {
 define void @fcmp_une_fv_v4f32(ptr %x, float %y, ptr %z) {
 ; CHECK-LABEL: fcmp_une_fv_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vmfne.vf v0, v8, fa0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
@@ -1358,12 +1363,13 @@ define void @fcmp_une_fv_v4f32(ptr %x, float %y, ptr %z) {
 define void @fcmp_une_fv_v4f32_nonans(ptr %x, float %y, ptr %z) {
 ; CHECK-LABEL: fcmp_une_fv_v4f32_nonans:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vmfne.vf v0, v8, fa0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
@@ -1383,12 +1389,13 @@ define void @fcmp_une_fv_v4f32_nonans(ptr %x, float %y, ptr %z) {
 define void @fcmp_ogt_fv_v2f64(ptr %x, double %y, ptr %z) {
 ; CHECK-LABEL: fcmp_ogt_fv_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
@@ -1408,12 +1415,13 @@ define void @fcmp_ogt_fv_v2f64(ptr %x, double %y, ptr %z) {
 define void @fcmp_ogt_fv_v2f64_nonans(ptr %x, double %y, ptr %z) {
 ; CHECK-LABEL: fcmp_ogt_fv_v2f64_nonans:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
index 41d8abb9b73eb..8e288fec53778 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
@@ -41,10 +41,10 @@ define <8 x float> @shuffle_v8f32(<8 x float> %x, <8 x float> %y) {
 define <4 x double> @shuffle_fv_v4f64(<4 x double> %x) {
 ; CHECK-LABEL: shuffle_fv_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI3_0)(a0)
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v0, 9
+; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI3_0)(a0)
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vfmerge.vfm v8, v8, fa5, v0
 ; CHECK-NEXT:    ret
@@ -55,10 +55,10 @@ define <4 x double> @shuffle_fv_v4f64(<4 x double> %x) {
 define <4 x double> @shuffle_vf_v4f64(<4 x double> %x) {
 ; CHECK-LABEL: shuffle_vf_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI4_0)(a0)
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v0, 6
+; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI4_0)(a0)
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vfmerge.vfm v8, v8, fa5, v0
 ; CHECK-NEXT:    ret
@@ -105,11 +105,12 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    lui a0, %hi(.LCPI7_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI7_0)
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vle16.v v14, (a0)
-; CHECK-NEXT:    vmv.v.i v0, 8
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v14
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 8
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v12, v10, 1, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -120,14 +121,16 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y)
 define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) {
 ; CHECK-LABEL: vrgather_shuffle_xv_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI8_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI8_0)(a0)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    lui a0, %hi(.LCPI8_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI8_0)(a0)
 ; CHECK-NEXT:    vrsub.vi v12, v10, 4
-; CHECK-NEXT:    vmv.v.i v0, 12
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v10, fa5
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 12
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -138,16 +141,16 @@ define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) {
 define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) {
 ; CHECK-LABEL: vrgather_shuffle_vx_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI9_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI9_0)(a0)
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 9
+; CHECK-NEXT:    lui a0, %hi(.LCPI9_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI9_0)(a0)
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vcompress.vm v12, v8, v10
+; CHECK-NEXT:    vfmv.v.f v8, fa5
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v0, 3
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vfmv.v.f v8, fa5
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v12, v0
 ; CHECK-NEXT:    ret
   %s = shufflevector <4 x double> %x, <4 x double> <double 2.0, double 2.0, double 2.0, double 2.0>, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
@@ -331,8 +334,8 @@ define <4 x bfloat> @vrgather_shuffle_vv_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI25_0)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vle16.v v11, (a0)
-; CHECK-NEXT:    vmv.v.i v0, 8
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
+; CHECK-NEXT:    vmv.v.i v0, 8
 ; CHECK-NEXT:    vrgather.vi v10, v9, 1, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
@@ -375,8 +378,8 @@ define <4 x half> @vrgather_shuffle_vv_v4f16(<4 x half> %x, <4 x half> %y) {
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI28_0)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vle16.v v11, (a0)
-; CHECK-NEXT:    vmv.v.i v0, 8
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
+; CHECK-NEXT:    vmv.v.i v0, 8
 ; CHECK-NEXT:    vrgather.vi v10, v9, 1, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
@@ -399,18 +402,18 @@ define <4 x half> @vrgather_shuffle_vx_v4f16_load(ptr %p) {
 define <16 x float> @shuffle_disjoint_lanes(<16 x float> %v, <16 x float> %w) {
 ; CHECK-LABEL: shuffle_disjoint_lanes:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI30_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI30_0)
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vle8.v v16, (a0)
 ; CHECK-NEXT:    lui a0, 11
 ; CHECK-NEXT:    addi a0, a0, -1366
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    lui a0, %hi(.LCPI30_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI30_0)
 ; CHECK-NEXT:    vmerge.vvm v12, v12, v8, v0
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v18, v16
+; CHECK-NEXT:    vsext.vf2 v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v8, v12, v18
+; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16
 ; CHECK-NEXT:    ret
   %out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
   ret <16 x float> %out
@@ -437,12 +440,12 @@ define <16 x float> @shuffle_disjoint_lanes_one_broadcast(<16 x float> %v, <16 x
 ; CHECK-NEXT:    lui a0, %hi(.LCPI32_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI32_0)
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; CHECK-NEXT:    vle16.v v20, (a0)
+; CHECK-NEXT:    vrgather.vi v16, v8, 7
+; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    lui a0, 15
 ; CHECK-NEXT:    addi a0, a0, 240
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vrgather.vi v16, v8, 7
-; CHECK-NEXT:    vrgatherei16.vv v16, v12, v20, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v16, v12, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
   %out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 26, i32 30, i32 22, i32 18, i32 7, i32 7, i32 7, i32 7, i32 24, i32 28, i32 20, i32 16>
@@ -452,14 +455,14 @@ define <16 x float> @shuffle_disjoint_lanes_one_broadcast(<16 x float> %v, <16 x
 define <16 x float> @shuffle_disjoint_lanes_one_splat(float %v, <16 x float> %w) {
 ; CHECK-LABEL: shuffle_disjoint_lanes_one_splat:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    lui a0, %hi(.LCPI33_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI33_0)
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vle16.v v16, (a0)
 ; CHECK-NEXT:    lui a0, 15
 ; CHECK-NEXT:    addi a0, a0, 240
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vfmv.v.f v12, fa0
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll
index 58b0a17cdccd6..fed76227a2b69 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll
@@ -53,8 +53,8 @@ define void @gather_const_v2f64(ptr %x) {
 define void @gather_const_v64f16(ptr %x) {
 ; CHECK-LABEL: gather_const_v64f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    flh fa5, 94(a0)
 ; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    flh fa5, 94(a0)
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v8, fa5
 ; CHECK-NEXT:    vse16.v v8, (a0)
@@ -70,8 +70,8 @@ define void @gather_const_v64f16(ptr %x) {
 define void @gather_const_v32f32(ptr %x) {
 ; CHECK-LABEL: gather_const_v32f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    flw fa5, 68(a0)
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    flw fa5, 68(a0)
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v8, fa5
 ; CHECK-NEXT:    vse32.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
index 585a331e55094..86c727199bbae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
@@ -9,10 +9,10 @@ define void @fadd_v8bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fadd_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfadd.vv v8, v12, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -30,10 +30,10 @@ define void @fadd_v6bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fadd_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfadd.vv v8, v12, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -60,10 +60,10 @@ define void @fadd_v8f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fadd_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v8, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -90,10 +90,10 @@ define void @fadd_v6f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fadd_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v8, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -143,10 +143,10 @@ define void @fsub_v8bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fsub_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfsub.vv v8, v12, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -164,10 +164,10 @@ define void @fsub_v6bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fsub_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfsub.vv v8, v12, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -194,10 +194,10 @@ define void @fsub_v8f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fsub_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v8, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -224,10 +224,10 @@ define void @fsub_v6f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fsub_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v8, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -277,10 +277,10 @@ define void @fmul_v8bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fmul_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfmul.vv v8, v12, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -298,10 +298,10 @@ define void @fmul_v6bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fmul_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfmul.vv v8, v12, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -328,10 +328,10 @@ define void @fmul_v8f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fmul_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v8, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -358,10 +358,10 @@ define void @fmul_v6f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fmul_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v8, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -411,10 +411,10 @@ define void @fdiv_v8bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fdiv_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfdiv.vv v8, v12, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -432,10 +432,10 @@ define void @fdiv_v6bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fdiv_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfdiv.vv v8, v12, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -462,10 +462,10 @@ define void @fdiv_v8f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fdiv_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v8, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -492,10 +492,10 @@ define void @fdiv_v6f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fdiv_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v8, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -757,13 +757,13 @@ define void @copysign_v8bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: copysign_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
 ; CHECK-NEXT:    lui a1, 8
-; CHECK-NEXT:    vand.vx v8, v8, a1
-; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vor.vv v8, v9, v8
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x bfloat>, ptr %x
@@ -777,13 +777,13 @@ define void @copysign_v6bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: copysign_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
 ; CHECK-NEXT:    lui a1, 8
-; CHECK-NEXT:    vand.vx v8, v8, a1
-; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vor.vv v8, v9, v8
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <6 x bfloat>, ptr %x
@@ -806,13 +806,13 @@ define void @copysign_v8f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: copysign_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
 ; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    addi a1, a1, -1
 ; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
-; ZVFHMIN-NEXT:    vor.vv v8, v9, v8
+; ZVFHMIN-NEXT:    addi a1, a1, -1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
+; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
@@ -835,13 +835,13 @@ define void @copysign_v6f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: copysign_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
 ; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    addi a1, a1, -1
 ; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
-; ZVFHMIN-NEXT:    vor.vv v8, v9, v8
+; ZVFHMIN-NEXT:    addi a1, a1, -1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
+; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <6 x half>, ptr %x
@@ -1023,14 +1023,14 @@ define void @copysign_neg_v8bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: copysign_neg_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
 ; CHECK-NEXT:    lui a1, 8
 ; CHECK-NEXT:    addi a2, a1, -1
-; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vand.vx v9, v9, a2
-; CHECK-NEXT:    vand.vx v8, v8, a1
-; CHECK-NEXT:    vor.vv v8, v9, v8
+; CHECK-NEXT:    vxor.vx v9, v9, a1
+; CHECK-NEXT:    vand.vx v8, v8, a2
+; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x bfloat>, ptr %x
@@ -1045,14 +1045,14 @@ define void @copysign_neg_v6bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: copysign_neg_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
 ; CHECK-NEXT:    lui a1, 8
 ; CHECK-NEXT:    addi a2, a1, -1
-; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vand.vx v9, v9, a2
-; CHECK-NEXT:    vand.vx v8, v8, a1
-; CHECK-NEXT:    vor.vv v8, v9, v8
+; CHECK-NEXT:    vxor.vx v9, v9, a1
+; CHECK-NEXT:    vand.vx v8, v8, a2
+; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <6 x bfloat>, ptr %x
@@ -1076,14 +1076,14 @@ define void @copysign_neg_v8f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: copysign_neg_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    addi a2, a1, -1
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vand.vx v9, v9, a2
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vor.vv v8, v9, v8
+; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a2
+; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
@@ -1107,14 +1107,14 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: copysign_neg_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    addi a2, a1, -1
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vand.vx v9, v9, a2
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vor.vv v8, v9, v8
+; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a2
+; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <6 x half>, ptr %x
@@ -1211,10 +1211,10 @@ define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, ptr %y) {
 ; ZVFH-LABEL: copysign_neg_trunc_v4f16_v4f32:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFH-NEXT:    vle32.v v8, (a1)
-; ZVFH-NEXT:    vle16.v v9, (a0)
-; ZVFH-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFH-NEXT:    vfsgnjn.vv v8, v9, v10
+; ZVFH-NEXT:    vle16.v v8, (a0)
+; ZVFH-NEXT:    vle32.v v9, (a1)
+; ZVFH-NEXT:    vfncvt.f.f.w v10, v9
+; ZVFH-NEXT:    vfsgnjn.vv v8, v8, v10
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
 ;
@@ -1245,10 +1245,10 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) {
 ; ZVFH-LABEL: copysign_neg_trunc_v3f16_v3f32:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
-; ZVFH-NEXT:    vle32.v v8, (a1)
-; ZVFH-NEXT:    vle16.v v9, (a0)
-; ZVFH-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFH-NEXT:    vfsgnjn.vv v8, v9, v10
+; ZVFH-NEXT:    vle16.v v8, (a0)
+; ZVFH-NEXT:    vle32.v v9, (a1)
+; ZVFH-NEXT:    vfncvt.f.f.w v10, v9
+; ZVFH-NEXT:    vfsgnjn.vv v8, v8, v10
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
 ;
@@ -1279,11 +1279,11 @@ define void @copysign_neg_ext_v2f64_v2f32(ptr %x, ptr %y) {
 ; CHECK-LABEL: copysign_neg_ext_v2f64_v2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a1)
-; CHECK-NEXT:    vle64.v v9, (a0)
-; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vle32.v v9, (a1)
+; CHECK-NEXT:    vfwcvt.f.f.v v10, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vfsgnjn.vv v8, v9, v10
+; CHECK-NEXT:    vfsgnjn.vv v8, v8, v10
 ; CHECK-NEXT:    vse64.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <2 x double>, ptr %x
@@ -1417,17 +1417,17 @@ define void @fma_v8bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-LABEL: fma_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a2)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vle16.v v10, (a1)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
+; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v14, v12
+; CHECK-NEXT:    vfmadd.vv v14, v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
-; CHECK-NEXT:    vse16.v v10, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v14
+; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x bfloat>, ptr %x
   %b = load <8 x bfloat>, ptr %y
@@ -1441,17 +1441,17 @@ define void @fma_v6bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-LABEL: fma_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a2)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vle16.v v10, (a1)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
+; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v14, v12
+; CHECK-NEXT:    vfmadd.vv v14, v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
-; CHECK-NEXT:    vse16.v v10, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v14
+; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <6 x bfloat>, ptr %x
   %b = load <6 x bfloat>, ptr %y
@@ -1475,17 +1475,17 @@ define void @fma_v8f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fma_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a2)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vle16.v v10, (a1)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vle16.v v10, (a2)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vse16.v v10, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
+; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = load <8 x half>, ptr %y
@@ -1509,17 +1509,17 @@ define void @fma_v6f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fma_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a2)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vle16.v v10, (a1)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vle16.v v10, (a2)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vse16.v v10, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
+; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <6 x half>, ptr %x
   %b = load <6 x half>, ptr %y
@@ -1569,19 +1569,19 @@ define void @fmsub_v8bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-LABEL: fmsub_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a2)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
+; CHECK-NEXT:    vle16.v v10, (a2)
 ; CHECK-NEXT:    lui a1, 8
-; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vxor.vx v10, v10, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v12, v14
+; CHECK-NEXT:    vfmadd.vv v10, v12, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
-; CHECK-NEXT:    vse16.v v10, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x bfloat>, ptr %x
   %b = load <8 x bfloat>, ptr %y
@@ -1596,19 +1596,19 @@ define void @fmsub_v6bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-LABEL: fmsub_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a2)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
+; CHECK-NEXT:    vle16.v v10, (a2)
 ; CHECK-NEXT:    lui a1, 8
-; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vxor.vx v10, v10, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v12, v14
+; CHECK-NEXT:    vfmadd.vv v10, v12, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
-; CHECK-NEXT:    vse16.v v10, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <6 x bfloat>, ptr %x
   %b = load <6 x bfloat>, ptr %y
@@ -1633,19 +1633,19 @@ define void @fmsub_v8f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fmsub_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a2)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vle16.v v10, (a2)
 ; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v14
+; ZVFHMIN-NEXT:    vfmadd.vv v10, v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vse16.v v10, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <8 x half>, ptr %x
   %b = load <8 x half>, ptr %y
@@ -1670,19 +1670,19 @@ define void @fmsub_v6f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fmsub_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a2)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vle16.v v10, (a2)
 ; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v14
+; ZVFHMIN-NEXT:    vfmadd.vv v10, v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT:    vse16.v v10, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <6 x half>, ptr %x
   %b = load <6 x half>, ptr %y
@@ -1736,10 +1736,10 @@ define void @fadd_v16bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fadd_v16bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v10, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfadd.vv v8, v16, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
@@ -1766,10 +1766,10 @@ define void @fadd_v16f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fadd_v16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v10, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v8, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
@@ -1819,10 +1819,10 @@ define void @fsub_v16bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fsub_v16bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v10, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfsub.vv v8, v16, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
@@ -1849,10 +1849,10 @@ define void @fsub_v16f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fsub_v16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v10, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v8, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
@@ -1902,10 +1902,10 @@ define void @fmul_v16bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fmul_v16bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v10, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfmul.vv v8, v16, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
@@ -1932,10 +1932,10 @@ define void @fmul_v16f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fmul_v16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v10, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v8, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
@@ -1985,10 +1985,10 @@ define void @fdiv_v16bf16(ptr %x, ptr %y) {
 ; CHECK-LABEL: fdiv_v16bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v10, (a0)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfdiv.vv v8, v16, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
@@ -2015,10 +2015,10 @@ define void @fdiv_v16f16(ptr %x, ptr %y) {
 ; ZVFHMIN-LABEL: fdiv_v16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v10, (a0)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v8, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
@@ -2134,17 +2134,17 @@ define void @fma_v16bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-LABEL: fma_v16bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a2)
-; CHECK-NEXT:    vle16.v v10, (a0)
-; CHECK-NEXT:    vle16.v v12, (a1)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v12, (a2)
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v20, v10
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v20, v16
+; CHECK-NEXT:    vfmadd.vv v20, v12, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v8
-; CHECK-NEXT:    vse16.v v12, (a0)
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v20
+; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <16 x bfloat>, ptr %x
   %b = load <16 x bfloat>, ptr %y
@@ -2168,17 +2168,17 @@ define void @fma_v16f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fma_v16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a2)
-; ZVFHMIN-NEXT:    vle16.v v10, (a0)
-; ZVFHMIN-NEXT:    vle16.v v12, (a1)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v10, (a1)
+; ZVFHMIN-NEXT:    vle16.v v12, (a2)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v8
-; ZVFHMIN-NEXT:    vse16.v v12, (a0)
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
+; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
   %a = load <16 x half>, ptr %x
   %b = load <16 x half>, ptr %y
@@ -3347,13 +3347,13 @@ define void @fdiv_fv_v2f64(ptr %x, double %y) {
 define void @fma_vf_v8bf16(ptr %x, ptr %y, bfloat %z) {
 ; CHECK-LABEL: fma_vf_v8bf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fmv.x.w a2, fa0
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
+; CHECK-NEXT:    vmv.v.x v10, a2
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v8, v14, v12
@@ -3373,13 +3373,13 @@ define void @fma_vf_v8bf16(ptr %x, ptr %y, bfloat %z) {
 define void @fma_vf_v6bf16(ptr %x, ptr %y, bfloat %z) {
 ; CHECK-LABEL: fma_vf_v6bf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fmv.x.w a2, fa0
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
+; CHECK-NEXT:    vmv.v.x v10, a2
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v8, v14, v12
@@ -3408,13 +3408,13 @@ define void @fma_vf_v8f16(ptr %x, ptr %y, half %z) {
 ;
 ; ZVFHMIN-LABEL: fma_vf_v8f16:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    fmv.x.w a2, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vmv.v.x v10, a2
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
@@ -3443,13 +3443,13 @@ define void @fma_vf_v6f16(ptr %x, ptr %y, half %z) {
 ;
 ; ZVFHMIN-LABEL: fma_vf_v6f16:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    fmv.x.w a2, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vmv.v.x v10, a2
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
@@ -3505,13 +3505,13 @@ define void @fma_vf_v2f64(ptr %x, ptr %y, double %z) {
 define void @fma_fv_v8bf16(ptr %x, ptr %y, bfloat %z) {
 ; CHECK-LABEL: fma_fv_v8bf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fmv.x.w a2, fa0
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
+; CHECK-NEXT:    vmv.v.x v10, a2
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v8, v14, v12
@@ -3531,13 +3531,13 @@ define void @fma_fv_v8bf16(ptr %x, ptr %y, bfloat %z) {
 define void @fma_fv_v6bf16(ptr %x, ptr %y, bfloat %z) {
 ; CHECK-LABEL: fma_fv_v6bf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fmv.x.w a2, fa0
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    fmv.x.w a1, fa0
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
+; CHECK-NEXT:    vmv.v.x v10, a2
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v8, v14, v12
@@ -3566,13 +3566,13 @@ define void @fma_fv_v8f16(ptr %x, ptr %y, half %z) {
 ;
 ; ZVFHMIN-LABEL: fma_fv_v8f16:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    fmv.x.w a2, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vmv.v.x v10, a2
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
@@ -3601,13 +3601,13 @@ define void @fma_fv_v6f16(ptr %x, ptr %y, half %z) {
 ;
 ; ZVFHMIN-LABEL: fma_fv_v6f16:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    fmv.x.w a2, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
+; ZVFHMIN-NEXT:    vmv.v.x v10, a2
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
@@ -3665,13 +3665,13 @@ define void @fmsub_vf_v8bf16(ptr %x, ptr %y, bfloat %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.w a2, fa0
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
 ; CHECK-NEXT:    lui a1, 8
 ; CHECK-NEXT:    vmv.v.x v10, a2
-; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
+; CHECK-NEXT:    vxor.vx v9, v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v8, v12, v14
@@ -3694,13 +3694,13 @@ define void @fmsub_vf_v6bf16(ptr %x, ptr %y, bfloat %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.w a2, fa0
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
 ; CHECK-NEXT:    lui a1, 8
 ; CHECK-NEXT:    vmv.v.x v10, a2
-; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
+; CHECK-NEXT:    vxor.vx v9, v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v8, v12, v14
@@ -3732,13 +3732,13 @@ define void @fmsub_vf_v8f16(ptr %x, ptr %y, half %z) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.w a2, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a2
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
+; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v14
@@ -3770,13 +3770,13 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.w a2, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a2
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
+; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v14
@@ -4057,11 +4057,11 @@ define void @ceil_v8bf16(ptr %x) {
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -4084,12 +4084,12 @@ define void @ceil_v6bf16(ptr %x) {
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    fsrm a1
@@ -4113,9 +4113,9 @@ define void @ceil_v8f16(ptr %x) {
 ; ZVFH-NEXT:    vle16.v v8, (a0)
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI177_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI177_0)(a1)
+; ZVFH-NEXT:    fsrmi a1, 3
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
-; ZVFH-NEXT:    fsrmi a1, 3
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a1
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -4130,11 +4130,11 @@ define void @ceil_v8f16(ptr %x) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fsrmi a1, 3
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a1, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a1
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -4157,10 +4157,10 @@ define void @ceil_v6f16(ptr %x) {
 ; ZVFH-NEXT:    vle16.v v8, (a0)
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI178_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI178_0)(a1)
+; ZVFH-NEXT:    fsrmi a1, 3
 ; ZVFH-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
-; ZVFH-NEXT:    fsrmi a1, 3
 ; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a1
@@ -4176,12 +4176,12 @@ define void @ceil_v6f16(ptr %x) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fsrmi a1, 3
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a1, 3
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a1
@@ -4205,9 +4205,9 @@ define void @ceil_v4f32(ptr %x) {
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -4228,9 +4228,9 @@ define void @ceil_v2f64(ptr %x) {
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI180_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI180_0)(a1)
+; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -4251,11 +4251,11 @@ define void @floor_v8bf16(ptr %x) {
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -4278,12 +4278,12 @@ define void @floor_v6bf16(ptr %x) {
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    fsrm a1
@@ -4307,9 +4307,9 @@ define void @floor_v8f16(ptr %x) {
 ; ZVFH-NEXT:    vle16.v v8, (a0)
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI183_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI183_0)(a1)
+; ZVFH-NEXT:    fsrmi a1, 2
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
-; ZVFH-NEXT:    fsrmi a1, 2
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a1
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -4324,11 +4324,11 @@ define void @floor_v8f16(ptr %x) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fsrmi a1, 2
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a1, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a1
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -4351,10 +4351,10 @@ define void @floor_v6f16(ptr %x) {
 ; ZVFH-NEXT:    vle16.v v8, (a0)
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI184_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI184_0)(a1)
+; ZVFH-NEXT:    fsrmi a1, 2
 ; ZVFH-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
-; ZVFH-NEXT:    fsrmi a1, 2
 ; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a1
@@ -4370,12 +4370,12 @@ define void @floor_v6f16(ptr %x) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fsrmi a1, 2
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a1, 2
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a1
@@ -4399,9 +4399,9 @@ define void @floor_v4f32(ptr %x) {
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -4422,9 +4422,9 @@ define void @floor_v2f64(ptr %x) {
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI186_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI186_0)(a1)
+; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -4445,11 +4445,11 @@ define void @round_v8bf16(ptr %x) {
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -4472,12 +4472,12 @@ define void @round_v6bf16(ptr %x) {
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    fsrm a1
@@ -4501,9 +4501,9 @@ define void @round_v8f16(ptr %x) {
 ; ZVFH-NEXT:    vle16.v v8, (a0)
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI189_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI189_0)(a1)
+; ZVFH-NEXT:    fsrmi a1, 4
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
-; ZVFH-NEXT:    fsrmi a1, 4
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a1
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -4518,11 +4518,11 @@ define void @round_v8f16(ptr %x) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fsrmi a1, 4
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a1, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a1
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -4545,10 +4545,10 @@ define void @round_v6f16(ptr %x) {
 ; ZVFH-NEXT:    vle16.v v8, (a0)
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI190_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI190_0)(a1)
+; ZVFH-NEXT:    fsrmi a1, 4
 ; ZVFH-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
-; ZVFH-NEXT:    fsrmi a1, 4
 ; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a1
@@ -4564,12 +4564,12 @@ define void @round_v6f16(ptr %x) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fsrmi a1, 4
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a1, 4
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a1
@@ -4593,9 +4593,9 @@ define void @round_v4f32(ptr %x) {
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -4616,9 +4616,9 @@ define void @round_v2f64(ptr %x) {
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI192_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI192_0)(a1)
+; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -4746,11 +4746,11 @@ define void @nearbyint_v8bf16(ptr %x) {
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    frflags a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    frflags a1
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    fsflags a1
@@ -4773,9 +4773,9 @@ define void @nearbyint_v8f16(ptr %x) {
 ; ZVFH-NEXT:    vle16.v v8, (a0)
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI198_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI198_0)(a1)
+; ZVFH-NEXT:    frflags a1
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
-; ZVFH-NEXT:    frflags a1
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; ZVFH-NEXT:    fsflags a1
@@ -4790,11 +4790,11 @@ define void @nearbyint_v8f16(ptr %x) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    frflags a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    frflags a1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    fsflags a1
@@ -4817,9 +4817,9 @@ define void @nearbyint_v4f32(ptr %x) {
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    frflags a1
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    frflags a1
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    fsflags a1
@@ -4840,9 +4840,9 @@ define void @nearbyint_v2f64(ptr %x) {
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI200_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI200_0)(a1)
+; CHECK-NEXT:    frflags a1
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    frflags a1
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    fsflags a1
@@ -4860,11 +4860,11 @@ define void @fmuladd_v8bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-LABEL: fmuladd_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
 ; CHECK-NEXT:    vle16.v v10, (a2)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfmul.vv v8, v14, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -4889,11 +4889,11 @@ define void @fmuladd_v6bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-LABEL: fmuladd_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
 ; CHECK-NEXT:    vle16.v v10, (a2)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfmul.vv v8, v14, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -4928,11 +4928,11 @@ define void @fmuladd_v8f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fmuladd_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
 ; ZVFHMIN-NEXT:    vle16.v v10, (a2)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v8, v14, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -4967,11 +4967,11 @@ define void @fmuladd_v6f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fmuladd_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
 ; ZVFHMIN-NEXT:    vle16.v v10, (a2)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v8, v14, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -5032,11 +5032,11 @@ define void @fmsub_fmuladd_v8bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-LABEL: fmsub_fmuladd_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
 ; CHECK-NEXT:    vle16.v v10, (a2)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfmul.vv v8, v14, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -5062,11 +5062,11 @@ define void @fmsub_fmuladd_v6bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-LABEL: fmsub_fmuladd_v6bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a1)
 ; CHECK-NEXT:    vle16.v v10, (a2)
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfmul.vv v8, v14, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -5102,11 +5102,11 @@ define void @fmsub_fmuladd_v8f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fmsub_fmuladd_v8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
 ; ZVFHMIN-NEXT:    vle16.v v10, (a2)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v8, v14, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -5142,11 +5142,11 @@ define void @fmsub_fmuladd_v6f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-LABEL: fmsub_fmuladd_v6f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vle16.v v8, (a1)
-; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    vle16.v v9, (a1)
 ; ZVFHMIN-NEXT:    vle16.v v10, (a2)
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v8, v14, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
index a1466d46f1ba7..5106ec1189327 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
@@ -96,9 +96,9 @@ declare <32 x double> @llvm.vp.fpext.v32f64.v32f32(<32 x float>, <32 x i1>, i32)
 define <32 x double> @vfpext_v32f32_v32f64(<32 x float> %a, <32 x i1> %m, i32 zeroext %vl) {
 ; CHECK-LABEL: vfpext_v32f32_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v16, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB7_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpowi.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpowi.ll
index c6b8b602718b7..c18d8639dc91c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpowi.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpowi.ll
@@ -743,8 +743,8 @@ define <16 x float> @powi_v16f32(<16 x float> %x, i32 %y) nounwind {
 ; RV64-NEXT:    addi a1, sp, 64
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vse32.v v8, (a1)
-; RV64-NEXT:    flw fa0, 124(sp)
 ; RV64-NEXT:    sext.w s2, a0
+; RV64-NEXT:    flw fa0, 124(sp)
 ; RV64-NEXT:    mv a0, s2
 ; RV64-NEXT:    call __powisf2
 ; RV64-NEXT:    fsw fa0, 188(sp)
@@ -1188,8 +1188,8 @@ define <8 x double> @powi_v8f64(<8 x double> %x, i32 %y) nounwind {
 ; RV64-NEXT:    addi a1, sp, 64
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vse64.v v8, (a1)
-; RV64-NEXT:    fld fa0, 120(sp)
 ; RV64-NEXT:    sext.w s2, a0
+; RV64-NEXT:    fld fa0, 120(sp)
 ; RV64-NEXT:    mv a0, s2
 ; RV64-NEXT:    call __powidf2
 ; RV64-NEXT:    fsd fa0, 184(sp)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
index f6c992280c6e3..e4609f1e9313d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
@@ -394,9 +394,9 @@ declare <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x i64> @vfptosi_v32i64_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vfptosi_v32i64_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB25_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
index af225f4d95aa2..846675cf5a9b4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
@@ -394,9 +394,9 @@ declare <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x i64> @vfptoui_v32i64_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vfptoui_v32i64_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB25_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
index 582706e4dfa18..ae53abc3f8c9a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
@@ -99,8 +99,8 @@ define <32 x float> @vfptrunc_v32f32_v32f64(<32 x double> %a, <32 x i1> %m, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv8r.v v24, v8
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vslidedown.vi v12, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB7_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
index be32c033fe373..751a6e45c0c3c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
@@ -12,11 +12,11 @@ define <1 x half> @round_v1f16(<1 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -35,11 +35,11 @@ define <2 x half> @round_v2f16(<2 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -58,11 +58,11 @@ define <4 x half> @round_v4f16(<4 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -81,11 +81,11 @@ define <8 x half> @round_v8f16(<8 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -104,11 +104,11 @@ define <16 x half> @round_v16f16(<16 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -130,9 +130,9 @@ define <32 x half> @round_v32f16(<32 x half> %x) strictfp {
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -153,9 +153,9 @@ define <1 x float> @round_v1f32(<1 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -176,9 +176,9 @@ define <2 x float> @round_v2f32(<2 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -199,9 +199,9 @@ define <4 x float> @round_v4f32(<4 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -222,9 +222,9 @@ define <8 x float> @round_v8f32(<8 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -245,9 +245,9 @@ define <16 x float> @round_v16f32(<16 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -266,11 +266,11 @@ define <1 x double> @round_v1f64(<1 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -289,11 +289,11 @@ define <2 x double> @round_v2f64(<2 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -312,11 +312,11 @@ define <4 x double> @round_v4f64(<4 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -335,11 +335,11 @@ define <8 x double> @round_v8f64(<8 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll
index 774ce5c7859c9..2bf3e9596597d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll
@@ -13,12 +13,12 @@
 define <1 x half> @round_v1f16(<1 x half> %x) {
 ; ZVFH-LABEL: round_v1f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI0_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
 ; ZVFH-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI0_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -31,11 +31,11 @@ define <1 x half> @round_v1f16(<1 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -52,12 +52,12 @@ declare <1 x half> @llvm.round.v1f16(<1 x half>)
 define <2 x half> @round_v2f16(<2 x half> %x) {
 ; ZVFH-LABEL: round_v2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI1_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI1_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -70,11 +70,11 @@ define <2 x half> @round_v2f16(<2 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -91,12 +91,12 @@ declare <2 x half> @llvm.round.v2f16(<2 x half>)
 define <4 x half> @round_v4f16(<4 x half> %x) {
 ; ZVFH-LABEL: round_v4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI2_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
 ; ZVFH-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI2_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -109,11 +109,11 @@ define <4 x half> @round_v4f16(<4 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -130,12 +130,12 @@ declare <4 x half> @llvm.round.v4f16(<4 x half>)
 define <8 x half> @round_v8f16(<8 x half> %x) {
 ; ZVFH-LABEL: round_v8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI3_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; ZVFH-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI3_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -148,11 +148,11 @@ define <8 x half> @round_v8f16(<8 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -169,12 +169,12 @@ declare <8 x half> @llvm.round.v8f16(<8 x half>)
 define <16 x half> @round_v16f16(<16 x half> %x) {
 ; ZVFH-LABEL: round_v16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI4_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
 ; ZVFH-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI4_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -187,11 +187,11 @@ define <16 x half> @round_v16f16(<16 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -208,15 +208,15 @@ declare <16 x half> @llvm.round.v16f16(<16 x half>)
 define <32 x half> @round_v32f16(<32 x half> %x) {
 ; ZVFH-LABEL: round_v32f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI5_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
 ; ZVFH-NEXT:    li a0, 32
+; ZVFH-NEXT:    lui a1, %hi(.LCPI5_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
+; ZVFH-NEXT:    fsrmi a1, 4
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
-; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
-; ZVFH-NEXT:    fsrm a0
+; ZVFH-NEXT:    fsrm a1
 ; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
@@ -226,15 +226,15 @@ define <32 x half> @round_v32f16(<32 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    li a0, 32
 ; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fsrmi a1, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    fsrm a0
+; ZVFHMIN-NEXT:    fsrm a1
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
@@ -253,8 +253,8 @@ define <1 x float> @round_v1f32(<1 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -273,8 +273,8 @@ define <2 x float> @round_v2f32(<2 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -293,8 +293,8 @@ define <4 x float> @round_v4f32(<4 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -313,8 +313,8 @@ define <8 x float> @round_v8f32(<8 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -333,8 +333,8 @@ define <16 x float> @round_v16f32(<16 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -349,12 +349,12 @@ declare <16 x float> @llvm.round.v16f32(<16 x float>)
 define <1 x double> @round_v1f64(<1 x double> %x) {
 ; CHECK-LABEL: round_v1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -369,12 +369,12 @@ declare <1 x double> @llvm.round.v1f64(<1 x double>)
 define <2 x double> @round_v2f64(<2 x double> %x) {
 ; CHECK-LABEL: round_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -389,12 +389,12 @@ declare <2 x double> @llvm.round.v2f64(<2 x double>)
 define <4 x double> @round_v4f64(<4 x double> %x) {
 ; CHECK-LABEL: round_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -409,12 +409,12 @@ declare <4 x double> @llvm.round.v4f64(<4 x double>)
 define <8 x double> @round_v8f64(<8 x double> %x) {
 ; CHECK-LABEL: round_v8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll
index 5c0279e133dfa..c61e707bd89f0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll
@@ -12,11 +12,11 @@ define <1 x half> @roundeven_v1f16(<1 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -35,11 +35,11 @@ define <2 x half> @roundeven_v2f16(<2 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -58,11 +58,11 @@ define <4 x half> @roundeven_v4f16(<4 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -81,11 +81,11 @@ define <8 x half> @roundeven_v8f16(<8 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -104,11 +104,11 @@ define <16 x half> @roundeven_v16f16(<16 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -130,9 +130,9 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) strictfp {
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -153,9 +153,9 @@ define <1 x float> @roundeven_v1f32(<1 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -176,9 +176,9 @@ define <2 x float> @roundeven_v2f32(<2 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -199,9 +199,9 @@ define <4 x float> @roundeven_v4f32(<4 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -222,9 +222,9 @@ define <8 x float> @roundeven_v8f32(<8 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -245,9 +245,9 @@ define <16 x float> @roundeven_v16f32(<16 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -266,11 +266,11 @@ define <1 x double> @roundeven_v1f64(<1 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -289,11 +289,11 @@ define <2 x double> @roundeven_v2f64(<2 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -312,11 +312,11 @@ define <4 x double> @roundeven_v4f64(<4 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -335,11 +335,11 @@ define <8 x double> @roundeven_v8f64(<8 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll
index 0b6baad127643..697fc657af5d1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll
@@ -13,12 +13,12 @@
 define <1 x half> @roundeven_v1f16(<1 x half> %x) {
 ; ZVFH-LABEL: roundeven_v1f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI0_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
 ; ZVFH-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI0_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -31,11 +31,11 @@ define <1 x half> @roundeven_v1f16(<1 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -52,12 +52,12 @@ declare <1 x half> @llvm.roundeven.v1f16(<1 x half>)
 define <2 x half> @roundeven_v2f16(<2 x half> %x) {
 ; ZVFH-LABEL: roundeven_v2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI1_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI1_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -70,11 +70,11 @@ define <2 x half> @roundeven_v2f16(<2 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -91,12 +91,12 @@ declare <2 x half> @llvm.roundeven.v2f16(<2 x half>)
 define <4 x half> @roundeven_v4f16(<4 x half> %x) {
 ; ZVFH-LABEL: roundeven_v4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI2_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
 ; ZVFH-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI2_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -109,11 +109,11 @@ define <4 x half> @roundeven_v4f16(<4 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -130,12 +130,12 @@ declare <4 x half> @llvm.roundeven.v4f16(<4 x half>)
 define <8 x half> @roundeven_v8f16(<8 x half> %x) {
 ; ZVFH-LABEL: roundeven_v8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI3_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; ZVFH-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI3_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -148,11 +148,11 @@ define <8 x half> @roundeven_v8f16(<8 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -169,12 +169,12 @@ declare <8 x half> @llvm.roundeven.v8f16(<8 x half>)
 define <16 x half> @roundeven_v16f16(<16 x half> %x) {
 ; ZVFH-LABEL: roundeven_v16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI4_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
 ; ZVFH-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI4_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -187,11 +187,11 @@ define <16 x half> @roundeven_v16f16(<16 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -208,15 +208,15 @@ declare <16 x half> @llvm.roundeven.v16f16(<16 x half>)
 define <32 x half> @roundeven_v32f16(<32 x half> %x) {
 ; ZVFH-LABEL: roundeven_v32f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI5_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
 ; ZVFH-NEXT:    li a0, 32
+; ZVFH-NEXT:    lui a1, %hi(.LCPI5_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
+; ZVFH-NEXT:    fsrmi a1, 0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
-; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
-; ZVFH-NEXT:    fsrm a0
+; ZVFH-NEXT:    fsrm a1
 ; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
@@ -226,15 +226,15 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    li a0, 32
 ; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
+; ZVFHMIN-NEXT:    fsrmi a1, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    fsrm a0
+; ZVFHMIN-NEXT:    fsrm a1
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
@@ -253,8 +253,8 @@ define <1 x float> @roundeven_v1f32(<1 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -273,8 +273,8 @@ define <2 x float> @roundeven_v2f32(<2 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -293,8 +293,8 @@ define <4 x float> @roundeven_v4f32(<4 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -313,8 +313,8 @@ define <8 x float> @roundeven_v8f32(<8 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -333,8 +333,8 @@ define <16 x float> @roundeven_v16f32(<16 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -349,12 +349,12 @@ declare <16 x float> @llvm.roundeven.v16f32(<16 x float>)
 define <1 x double> @roundeven_v1f64(<1 x double> %x) {
 ; CHECK-LABEL: roundeven_v1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -369,12 +369,12 @@ declare <1 x double> @llvm.roundeven.v1f64(<1 x double>)
 define <2 x double> @roundeven_v2f64(<2 x double> %x) {
 ; CHECK-LABEL: roundeven_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -389,12 +389,12 @@ declare <2 x double> @llvm.roundeven.v2f64(<2 x double>)
 define <4 x double> @roundeven_v4f64(<4 x double> %x) {
 ; CHECK-LABEL: roundeven_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -409,12 +409,12 @@ declare <4 x double> @llvm.roundeven.v4f64(<4 x double>)
 define <8 x double> @roundeven_v8f64(<8 x double> %x) {
 ; CHECK-LABEL: roundeven_v8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index 62e7e3b109902..82d740d3113eb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -285,14 +285,14 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
 define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
 ; VLA-LABEL: insert_v8i32_v2i32_2:
 ; VLA:       # %bb.0:
-; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLA-NEXT:    vle32.v v8, (a0)
 ; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; VLA-NEXT:    vle32.v v10, (a1)
+; VLA-NEXT:    vle32.v v8, (a1)
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vle32.v v10, (a0)
 ; VLA-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
-; VLA-NEXT:    vslideup.vi v8, v10, 2
+; VLA-NEXT:    vslideup.vi v10, v8, 2
 ; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLA-NEXT:    vse32.v v8, (a0)
+; VLA-NEXT:    vse32.v v10, (a0)
 ; VLA-NEXT:    ret
 ;
 ; VLS-LABEL: insert_v8i32_v2i32_2:
@@ -314,13 +314,12 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
 define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) {
 ; VLA-LABEL: insert_v8i32_v2i32_6:
 ; VLA:       # %bb.0:
-; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLA-NEXT:    vle32.v v8, (a0)
 ; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; VLA-NEXT:    vle32.v v10, (a1)
+; VLA-NEXT:    vle32.v v8, (a1)
 ; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLA-NEXT:    vslideup.vi v8, v10, 6
-; VLA-NEXT:    vse32.v v8, (a0)
+; VLA-NEXT:    vle32.v v10, (a0)
+; VLA-NEXT:    vslideup.vi v10, v8, 6
+; VLA-NEXT:    vse32.v v10, (a0)
 ; VLA-NEXT:    ret
 ;
 ; VLS-LABEL: insert_v8i32_v2i32_6:
@@ -830,13 +829,13 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
 ; RV32VLS-NEXT:    vl1re64.v v8, (a0)
 ; RV32VLS-NEXT:    addi a0, sp, 128
 ; RV32VLS-NEXT:    vs1r.v v8, (a0)
-; RV32VLS-NEXT:    addi a0, sp, 192
-; RV32VLS-NEXT:    vl8re64.v v8, (a0)
 ; RV32VLS-NEXT:    addi a0, sp, 64
+; RV32VLS-NEXT:    vl8re64.v v8, (a0)
+; RV32VLS-NEXT:    addi a0, sp, 192
 ; RV32VLS-NEXT:    vl8re64.v v16, (a0)
 ; RV32VLS-NEXT:    addi a0, a1, 128
-; RV32VLS-NEXT:    vs8r.v v8, (a0)
-; RV32VLS-NEXT:    vs8r.v v16, (a1)
+; RV32VLS-NEXT:    vs8r.v v16, (a0)
+; RV32VLS-NEXT:    vs8r.v v8, (a1)
 ; RV32VLS-NEXT:    addi sp, s0, -80
 ; RV32VLS-NEXT:    .cfi_def_cfa sp, 80
 ; RV32VLS-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
@@ -862,13 +861,13 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
 ; RV64VLS-NEXT:    vl1re64.v v8, (a0)
 ; RV64VLS-NEXT:    addi a0, sp, 128
 ; RV64VLS-NEXT:    vs1r.v v8, (a0)
-; RV64VLS-NEXT:    addi a0, sp, 192
-; RV64VLS-NEXT:    vl8re64.v v8, (a0)
 ; RV64VLS-NEXT:    addi a0, sp, 64
+; RV64VLS-NEXT:    vl8re64.v v8, (a0)
+; RV64VLS-NEXT:    addi a0, sp, 192
 ; RV64VLS-NEXT:    vl8re64.v v16, (a0)
 ; RV64VLS-NEXT:    addi a0, a1, 128
-; RV64VLS-NEXT:    vs8r.v v8, (a0)
-; RV64VLS-NEXT:    vs8r.v v16, (a1)
+; RV64VLS-NEXT:    vs8r.v v16, (a0)
+; RV64VLS-NEXT:    vs8r.v v8, (a1)
 ; RV64VLS-NEXT:    addi sp, s0, -80
 ; RV64VLS-NEXT:    .cfi_def_cfa sp, 80
 ; RV64VLS-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
index 6782b2003ba94..ae0736682c9dd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
@@ -542,11 +542,11 @@ define void @insertelt_c6_v8i64_0_add(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vle64.v v12, (a1)
-; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    li a2, 6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a1
+; CHECK-NEXT:    vmv.s.x v8, a2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vle64.v v12, (a1)
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
 ; CHECK-NEXT:    vse64.v v8, (a0)
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
index c628a0d620498..b8e299e67fc04 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -530,10 +530,10 @@ define void @buildvec_dominant0_v2i32(ptr %x) {
 ;
 ; RV64V-LABEL: buildvec_dominant0_v2i32:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    lui a1, %hi(.LCPI40_0)
-; RV64V-NEXT:    ld a1, %lo(.LCPI40_0)(a1)
 ; RV64V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64V-NEXT:    vmv.v.i v8, -1
+; RV64V-NEXT:    lui a1, %hi(.LCPI40_0)
+; RV64V-NEXT:    ld a1, %lo(.LCPI40_0)(a1)
 ; RV64V-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
 ; RV64V-NEXT:    vmv.s.x v8, a1
 ; RV64V-NEXT:    vse64.v v8, (a0)
@@ -698,15 +698,16 @@ define void @buildvec_seq_v9i8(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 73
 ; CHECK-NEXT:    vsetivli zero, 9, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v9, 3
+; CHECK-NEXT:    vmv.v.i v8, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    li a1, 146
-; CHECK-NEXT:    vmv.s.x v8, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v9, 2, v0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmerge.vim v8, v8, 2, v0
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
   store <9 x i8> <i8 1, i8 2, i8 3, i8 1, i8 2, i8 3, i8 1, i8 2, i8 3>, ptr %x
@@ -935,11 +936,13 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_1() vscale_range(16,
 ; RV32-NEXT:    li a0, 512
 ; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
 ; RV32-NEXT:    vid.v v8
+; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV32-NEXT:    vmv.v.i v12, 1
+; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
 ; RV32-NEXT:    vsrl.vi v8, v8, 3
 ; RV32-NEXT:    vadd.vi v0, v8, -1
 ; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v8, 1
-; RV32-NEXT:    vmerge.vim v8, v8, 0, v0
+; RV32-NEXT:    vmerge.vim v8, v12, 0, v0
 ; RV32-NEXT:    ret
 ;
 ; RV64V-LABEL: buildvec_not_vid_v512i8_indices_overflow_1:
@@ -947,11 +950,13 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_1() vscale_range(16,
 ; RV64V-NEXT:    li a0, 512
 ; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
 ; RV64V-NEXT:    vid.v v8
+; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV64V-NEXT:    vmv.v.i v12, 1
+; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
 ; RV64V-NEXT:    vsrl.vi v8, v8, 2
 ; RV64V-NEXT:    vadd.vi v0, v8, -1
 ; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV64V-NEXT:    vmv.v.i v8, 1
-; RV64V-NEXT:    vmerge.vim v8, v8, 0, v0
+; RV64V-NEXT:    vmerge.vim v8, v12, 0, v0
 ; RV64V-NEXT:    ret
 ;
 ; RV64ZVE32-LABEL: buildvec_not_vid_v512i8_indices_overflow_1:
@@ -959,11 +964,13 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_1() vscale_range(16,
 ; RV64ZVE32-NEXT:    li a0, 512
 ; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
 ; RV64ZVE32-NEXT:    vid.v v8
+; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV64ZVE32-NEXT:    vmv.v.i v12, 1
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
 ; RV64ZVE32-NEXT:    vsrl.vi v8, v8, 3
 ; RV64ZVE32-NEXT:    vadd.vi v0, v8, -1
 ; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV64ZVE32-NEXT:    vmv.v.i v8, 1
-; RV64ZVE32-NEXT:    vmerge.vim v8, v8, 0, v0
+; RV64ZVE32-NEXT:    vmerge.vim v8, v12, 0, v0
 ; RV64ZVE32-NEXT:    ret
   ret <512 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
 }
@@ -973,27 +980,27 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16,
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v0, 15
-; RV32-NEXT:    vmv.v.i v9, 0
+; RV32-NEXT:    vmv.v.i v8, 0
 ; RV32-NEXT:    li a0, 512
 ; RV32-NEXT:    li a1, 240
-; RV32-NEXT:    vmv.s.x v8, a1
-; RV32-NEXT:    li a1, 15
-; RV32-NEXT:    vmerge.vim v10, v9, -1, v0
+; RV32-NEXT:    vmerge.vim v9, v8, -1, v0
 ; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV32-NEXT:    vmv.v.i v12, 3
-; RV32-NEXT:    slli a1, a1, 8
-; RV32-NEXT:    vmv1r.v v0, v10
+; RV32-NEXT:    vmv1r.v v0, v9
 ; RV32-NEXT:    vmerge.vim v12, v12, 0, v0
-; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
+; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    li a1, 15
+; RV32-NEXT:    slli a1, a1, 8
 ; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
-; RV32-NEXT:    vmerge.vim v10, v9, -1, v0
-; RV32-NEXT:    vmv.s.x v8, a1
-; RV32-NEXT:    vmv1r.v v0, v10
+; RV32-NEXT:    vmerge.vim v9, v8, -1, v0
+; RV32-NEXT:    vmv1r.v v0, v9
 ; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV32-NEXT:    vmerge.vim v12, v12, 1, v0
-; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
+; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
-; RV32-NEXT:    vmerge.vim v8, v9, -1, v0
+; RV32-NEXT:    vmerge.vim v8, v8, -1, v0
 ; RV32-NEXT:    vmv1r.v v0, v8
 ; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV32-NEXT:    vmerge.vim v8, v12, 2, v0
@@ -1003,25 +1010,23 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16,
 ; RV64V:       # %bb.0:
 ; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
 ; RV64V-NEXT:    vmv.v.i v0, 3
-; RV64V-NEXT:    vmv.v.i v9, 0
+; RV64V-NEXT:    vmv.v.i v8, 0
 ; RV64V-NEXT:    li a0, 512
-; RV64V-NEXT:    vmv.v.i v8, 12
-; RV64V-NEXT:    li a1, 48
-; RV64V-NEXT:    vmerge.vim v10, v9, -1, v0
+; RV64V-NEXT:    vmerge.vim v9, v8, -1, v0
 ; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV64V-NEXT:    vmv.v.i v12, 3
-; RV64V-NEXT:    vmv1r.v v0, v10
+; RV64V-NEXT:    vmv1r.v v0, v9
 ; RV64V-NEXT:    vmerge.vim v12, v12, 0, v0
-; RV64V-NEXT:    vmv1r.v v0, v8
 ; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
-; RV64V-NEXT:    vmerge.vim v10, v9, -1, v0
-; RV64V-NEXT:    vmv.s.x v8, a1
-; RV64V-NEXT:    vmv.v.v v0, v10
+; RV64V-NEXT:    vmv.v.i v0, 12
+; RV64V-NEXT:    li a1, 48
+; RV64V-NEXT:    vmerge.vim v9, v8, -1, v0
+; RV64V-NEXT:    vmv.v.v v0, v9
 ; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV64V-NEXT:    vmerge.vim v12, v12, 1, v0
-; RV64V-NEXT:    vmv1r.v v0, v8
+; RV64V-NEXT:    vmv.s.x v0, a1
 ; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
-; RV64V-NEXT:    vmerge.vim v8, v9, -1, v0
+; RV64V-NEXT:    vmerge.vim v8, v8, -1, v0
 ; RV64V-NEXT:    vmv.v.v v0, v8
 ; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV64V-NEXT:    vmerge.vim v8, v12, 2, v0
@@ -1031,27 +1036,27 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16,
 ; RV64ZVE32:       # %bb.0:
 ; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
 ; RV64ZVE32-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32-NEXT:    vmv.v.i v9, 0
+; RV64ZVE32-NEXT:    vmv.v.i v8, 0
 ; RV64ZVE32-NEXT:    li a0, 512
 ; RV64ZVE32-NEXT:    li a1, 240
-; RV64ZVE32-NEXT:    vmv.s.x v8, a1
-; RV64ZVE32-NEXT:    li a1, 15
-; RV64ZVE32-NEXT:    vmerge.vim v10, v9, -1, v0
+; RV64ZVE32-NEXT:    vmerge.vim v9, v8, -1, v0
 ; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV64ZVE32-NEXT:    vmv.v.i v12, 3
-; RV64ZVE32-NEXT:    slli a1, a1, 8
-; RV64ZVE32-NEXT:    vmv1r.v v0, v10
+; RV64ZVE32-NEXT:    vmv1r.v v0, v9
 ; RV64ZVE32-NEXT:    vmerge.vim v12, v12, 0, v0
-; RV64ZVE32-NEXT:    vmv1r.v v0, v8
+; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v0, a1
+; RV64ZVE32-NEXT:    li a1, 15
+; RV64ZVE32-NEXT:    slli a1, a1, 8
 ; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
-; RV64ZVE32-NEXT:    vmerge.vim v10, v9, -1, v0
-; RV64ZVE32-NEXT:    vmv.s.x v8, a1
-; RV64ZVE32-NEXT:    vmv.v.v v0, v10
+; RV64ZVE32-NEXT:    vmerge.vim v9, v8, -1, v0
+; RV64ZVE32-NEXT:    vmv.v.v v0, v9
 ; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV64ZVE32-NEXT:    vmerge.vim v12, v12, 1, v0
-; RV64ZVE32-NEXT:    vmv1r.v v0, v8
+; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v0, a1
 ; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
-; RV64ZVE32-NEXT:    vmerge.vim v8, v9, -1, v0
+; RV64ZVE32-NEXT:    vmerge.vim v8, v8, -1, v0
 ; RV64ZVE32-NEXT:    vmv.v.v v0, v8
 ; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV64ZVE32-NEXT:    vmerge.vim v8, v12, 2, v0
@@ -1358,15 +1363,13 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RV32-ONLY-NEXT:    lbu t2, 9(a0)
 ; RV32-ONLY-NEXT:    lbu t3, 10(a0)
 ; RV32-ONLY-NEXT:    lbu t4, 11(a0)
-; RV32-ONLY-NEXT:    li t5, 255
-; RV32-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV32-ONLY-NEXT:    vmv.s.x v0, t5
 ; RV32-ONLY-NEXT:    lbu t5, 12(a0)
 ; RV32-ONLY-NEXT:    lbu t6, 13(a0)
 ; RV32-ONLY-NEXT:    lbu s0, 14(a0)
 ; RV32-ONLY-NEXT:    lbu a0, 15(a0)
-; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV32-ONLY-NEXT:    vmv.v.x v8, a1
+; RV32-ONLY-NEXT:    li a1, 255
 ; RV32-ONLY-NEXT:    vmv.v.x v9, t1
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t2
@@ -1382,6 +1385,9 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, s0
 ; RV32-ONLY-NEXT:    vslide1down.vx v10, v8, t0
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v9, a0
+; RV32-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32-ONLY-NEXT:    vmv.s.x v0, a1
+; RV32-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV32-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV32-ONLY-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32-ONLY-NEXT:    .cfi_restore s0
@@ -1417,24 +1423,24 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RV32VB-NEXT:    slli t1, t1, 24
 ; RV32VB-NEXT:    or a7, t0, a7
 ; RV32VB-NEXT:    or a4, a4, a5
-; RV32VB-NEXT:    lbu a5, 12(a0)
+; RV32VB-NEXT:    or a5, t1, a6
+; RV32VB-NEXT:    lbu a6, 12(a0)
 ; RV32VB-NEXT:    lbu t0, 13(a0)
-; RV32VB-NEXT:    or a6, t1, a6
 ; RV32VB-NEXT:    lbu t1, 14(a0)
 ; RV32VB-NEXT:    lbu a0, 15(a0)
 ; RV32VB-NEXT:    slli t0, t0, 8
-; RV32VB-NEXT:    or a5, a5, t0
+; RV32VB-NEXT:    or a6, a6, t0
 ; RV32VB-NEXT:    slli t1, t1, 16
 ; RV32VB-NEXT:    slli a0, a0, 24
 ; RV32VB-NEXT:    or a0, a0, t1
 ; RV32VB-NEXT:    or a1, a1, a3
 ; RV32VB-NEXT:    or a2, a2, a7
-; RV32VB-NEXT:    or a3, a4, a6
-; RV32VB-NEXT:    or a0, a5, a0
+; RV32VB-NEXT:    or a4, a4, a5
+; RV32VB-NEXT:    or a0, a6, a0
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a1
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a2
-; RV32VB-NEXT:    vslide1down.vx v8, v8, a3
+; RV32VB-NEXT:    vslide1down.vx v8, v8, a4
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-NEXT:    ret
 ;
@@ -1449,29 +1455,29 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RV32VB-PACK-NEXT:    lbu a7, 6(a0)
 ; RV32VB-PACK-NEXT:    lbu t0, 7(a0)
 ; RV32VB-PACK-NEXT:    packh a1, a1, a2
-; RV32VB-PACK-NEXT:    lbu a2, 8(a0)
-; RV32VB-PACK-NEXT:    lbu t1, 9(a0)
-; RV32VB-PACK-NEXT:    lbu t2, 10(a0)
-; RV32VB-PACK-NEXT:    lbu t3, 11(a0)
-; RV32VB-PACK-NEXT:    packh a3, a3, a4
-; RV32VB-PACK-NEXT:    packh a4, a5, a6
-; RV32VB-PACK-NEXT:    packh a5, a7, t0
+; RV32VB-PACK-NEXT:    packh a2, a3, a4
+; RV32VB-PACK-NEXT:    packh a3, a5, a6
+; RV32VB-PACK-NEXT:    lbu a4, 8(a0)
+; RV32VB-PACK-NEXT:    lbu a5, 9(a0)
+; RV32VB-PACK-NEXT:    lbu a6, 10(a0)
+; RV32VB-PACK-NEXT:    lbu t1, 11(a0)
+; RV32VB-PACK-NEXT:    packh a7, a7, t0
+; RV32VB-PACK-NEXT:    packh a4, a4, a5
+; RV32VB-PACK-NEXT:    packh a5, a6, t1
 ; RV32VB-PACK-NEXT:    lbu a6, 12(a0)
-; RV32VB-PACK-NEXT:    lbu a7, 13(a0)
-; RV32VB-PACK-NEXT:    lbu t0, 14(a0)
+; RV32VB-PACK-NEXT:    lbu t0, 13(a0)
+; RV32VB-PACK-NEXT:    lbu t1, 14(a0)
 ; RV32VB-PACK-NEXT:    lbu a0, 15(a0)
-; RV32VB-PACK-NEXT:    packh a2, a2, t1
-; RV32VB-PACK-NEXT:    packh t1, t2, t3
-; RV32VB-PACK-NEXT:    packh a6, a6, a7
-; RV32VB-PACK-NEXT:    packh a0, t0, a0
-; RV32VB-PACK-NEXT:    pack a1, a1, a3
+; RV32VB-PACK-NEXT:    packh a6, a6, t0
+; RV32VB-PACK-NEXT:    packh a0, t1, a0
+; RV32VB-PACK-NEXT:    pack a1, a1, a2
+; RV32VB-PACK-NEXT:    pack a2, a3, a7
 ; RV32VB-PACK-NEXT:    pack a3, a4, a5
-; RV32VB-PACK-NEXT:    pack a2, a2, t1
 ; RV32VB-PACK-NEXT:    pack a0, a6, a0
 ; RV32VB-PACK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a1
-; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a2
+; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    ret
 ;
@@ -1493,15 +1499,13 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RV64V-ONLY-NEXT:    lbu t2, 9(a0)
 ; RV64V-ONLY-NEXT:    lbu t3, 10(a0)
 ; RV64V-ONLY-NEXT:    lbu t4, 11(a0)
-; RV64V-ONLY-NEXT:    li t5, 255
-; RV64V-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64V-ONLY-NEXT:    vmv.s.x v0, t5
 ; RV64V-ONLY-NEXT:    lbu t5, 12(a0)
 ; RV64V-ONLY-NEXT:    lbu t6, 13(a0)
 ; RV64V-ONLY-NEXT:    lbu s0, 14(a0)
 ; RV64V-ONLY-NEXT:    lbu a0, 15(a0)
-; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64V-ONLY-NEXT:    vmv.v.x v8, a1
+; RV64V-ONLY-NEXT:    li a1, 255
 ; RV64V-ONLY-NEXT:    vmv.v.x v9, t1
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t2
@@ -1517,6 +1521,9 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, s0
 ; RV64V-ONLY-NEXT:    vslide1down.vx v10, v8, t0
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v9, a0
+; RV64V-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64V-ONLY-NEXT:    vmv.s.x v0, a1
+; RV64V-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV64V-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64V-ONLY-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; RV64V-ONLY-NEXT:    .cfi_restore s0
@@ -1577,35 +1584,35 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ;
 ; RVA22U64-PACK-LABEL: buildvec_v16i8_loads_contigous:
 ; RVA22U64-PACK:       # %bb.0:
-; RVA22U64-PACK-NEXT:    lbu a1, 0(a0)
+; RVA22U64-PACK-NEXT:    lbu a6, 0(a0)
 ; RVA22U64-PACK-NEXT:    lbu a2, 1(a0)
-; RVA22U64-PACK-NEXT:    lbu a6, 2(a0)
-; RVA22U64-PACK-NEXT:    lbu a7, 3(a0)
-; RVA22U64-PACK-NEXT:    lbu t0, 4(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 5(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 6(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 7(a0)
-; RVA22U64-PACK-NEXT:    packh t1, a1, a2
-; RVA22U64-PACK-NEXT:    lbu t2, 8(a0)
-; RVA22U64-PACK-NEXT:    lbu t3, 9(a0)
-; RVA22U64-PACK-NEXT:    lbu t4, 10(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 2(a0)
+; RVA22U64-PACK-NEXT:    lbu a4, 3(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 4(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 5(a0)
+; RVA22U64-PACK-NEXT:    lbu a7, 6(a0)
+; RVA22U64-PACK-NEXT:    lbu t0, 7(a0)
+; RVA22U64-PACK-NEXT:    packh a6, a6, a2
+; RVA22U64-PACK-NEXT:    packh t2, a3, a4
+; RVA22U64-PACK-NEXT:    packh t1, a5, a1
+; RVA22U64-PACK-NEXT:    lbu a4, 8(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 9(a0)
+; RVA22U64-PACK-NEXT:    lbu a2, 10(a0)
 ; RVA22U64-PACK-NEXT:    lbu a1, 11(a0)
-; RVA22U64-PACK-NEXT:    packh a6, a6, a7
-; RVA22U64-PACK-NEXT:    packh a7, t0, a3
-; RVA22U64-PACK-NEXT:    packh t0, a4, a5
-; RVA22U64-PACK-NEXT:    lbu a5, 12(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 13(a0)
-; RVA22U64-PACK-NEXT:    lbu a2, 14(a0)
+; RVA22U64-PACK-NEXT:    packh a7, a7, t0
+; RVA22U64-PACK-NEXT:    packh a4, a4, a5
+; RVA22U64-PACK-NEXT:    packh a1, a2, a1
+; RVA22U64-PACK-NEXT:    lbu a2, 12(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 13(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 14(a0)
 ; RVA22U64-PACK-NEXT:    lbu a0, 15(a0)
-; RVA22U64-PACK-NEXT:    packh a4, t2, t3
-; RVA22U64-PACK-NEXT:    packh a1, t4, a1
-; RVA22U64-PACK-NEXT:    packh a3, a5, a3
-; RVA22U64-PACK-NEXT:    packh a0, a2, a0
-; RVA22U64-PACK-NEXT:    packw a2, t1, a6
-; RVA22U64-PACK-NEXT:    packw a5, a7, t0
+; RVA22U64-PACK-NEXT:    packh a2, a2, a5
+; RVA22U64-PACK-NEXT:    packh a0, a3, a0
+; RVA22U64-PACK-NEXT:    packw a3, a6, t2
+; RVA22U64-PACK-NEXT:    packw a5, t1, a7
 ; RVA22U64-PACK-NEXT:    packw a1, a4, a1
-; RVA22U64-PACK-NEXT:    packw a0, a3, a0
-; RVA22U64-PACK-NEXT:    pack a2, a2, a5
+; RVA22U64-PACK-NEXT:    packw a0, a2, a0
+; RVA22U64-PACK-NEXT:    pack a2, a3, a5
 ; RVA22U64-PACK-NEXT:    pack a0, a1, a0
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVA22U64-PACK-NEXT:    vmv.v.x v8, a2
@@ -1630,15 +1637,13 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RV64ZVE32-NEXT:    lbu t2, 9(a0)
 ; RV64ZVE32-NEXT:    lbu t3, 10(a0)
 ; RV64ZVE32-NEXT:    lbu t4, 11(a0)
-; RV64ZVE32-NEXT:    li t5, 255
-; RV64ZVE32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32-NEXT:    vmv.s.x v0, t5
 ; RV64ZVE32-NEXT:    lbu t5, 12(a0)
 ; RV64ZVE32-NEXT:    lbu t6, 13(a0)
 ; RV64ZVE32-NEXT:    lbu s0, 14(a0)
 ; RV64ZVE32-NEXT:    lbu a0, 15(a0)
-; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64ZVE32-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32-NEXT:    li a1, 255
 ; RV64ZVE32-NEXT:    vmv.v.x v9, t1
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t2
@@ -1654,6 +1659,9 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, s0
 ; RV64ZVE32-NEXT:    vslide1down.vx v10, v8, t0
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v0, a1
+; RV64ZVE32-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV64ZVE32-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64ZVE32-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; RV64ZVE32-NEXT:    .cfi_restore s0
@@ -1732,15 +1740,13 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ; RV32-ONLY-NEXT:    lbu t2, 154(a0)
 ; RV32-ONLY-NEXT:    lbu t3, 161(a0)
 ; RV32-ONLY-NEXT:    lbu t4, 163(a0)
-; RV32-ONLY-NEXT:    li t5, 255
-; RV32-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV32-ONLY-NEXT:    vmv.s.x v0, t5
 ; RV32-ONLY-NEXT:    lbu t5, 93(a0)
 ; RV32-ONLY-NEXT:    lbu t6, 105(a0)
 ; RV32-ONLY-NEXT:    lbu s0, 124(a0)
 ; RV32-ONLY-NEXT:    lbu a0, 144(a0)
-; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV32-ONLY-NEXT:    vmv.v.x v8, a1
+; RV32-ONLY-NEXT:    li a1, 255
 ; RV32-ONLY-NEXT:    vmv.v.x v9, t1
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t5
@@ -1756,6 +1762,9 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a0
 ; RV32-ONLY-NEXT:    vslide1down.vx v10, v8, t0
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v9, t2
+; RV32-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32-ONLY-NEXT:    vmv.s.x v0, a1
+; RV32-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV32-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV32-ONLY-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32-ONLY-NEXT:    .cfi_restore s0
@@ -1777,38 +1786,38 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ; RV32VB-NEXT:    slli a2, a2, 8
 ; RV32VB-NEXT:    slli a3, a3, 16
 ; RV32VB-NEXT:    slli a4, a4, 24
+; RV32VB-NEXT:    slli a7, a7, 8
 ; RV32VB-NEXT:    or a1, a1, a2
 ; RV32VB-NEXT:    or a3, a4, a3
-; RV32VB-NEXT:    lbu a2, 93(a0)
-; RV32VB-NEXT:    lbu a4, 105(a0)
-; RV32VB-NEXT:    lbu t2, 124(a0)
-; RV32VB-NEXT:    lbu t3, 144(a0)
-; RV32VB-NEXT:    slli a7, a7, 8
+; RV32VB-NEXT:    or a2, a6, a7
+; RV32VB-NEXT:    lbu a4, 93(a0)
+; RV32VB-NEXT:    lbu a6, 105(a0)
+; RV32VB-NEXT:    lbu a7, 124(a0)
+; RV32VB-NEXT:    lbu t2, 144(a0)
 ; RV32VB-NEXT:    slli a5, a5, 16
 ; RV32VB-NEXT:    slli t0, t0, 24
-; RV32VB-NEXT:    slli a2, a2, 8
-; RV32VB-NEXT:    or a6, a6, a7
+; RV32VB-NEXT:    slli a4, a4, 8
 ; RV32VB-NEXT:    or a5, t0, a5
-; RV32VB-NEXT:    lbu a7, 154(a0)
-; RV32VB-NEXT:    lbu t0, 161(a0)
-; RV32VB-NEXT:    or a2, t1, a2
+; RV32VB-NEXT:    or a4, t1, a4
+; RV32VB-NEXT:    lbu t0, 154(a0)
+; RV32VB-NEXT:    lbu t1, 161(a0)
 ; RV32VB-NEXT:    lbu a0, 163(a0)
-; RV32VB-NEXT:    slli a4, a4, 16
-; RV32VB-NEXT:    slli t0, t0, 24
-; RV32VB-NEXT:    or a4, t0, a4
+; RV32VB-NEXT:    slli a6, a6, 16
+; RV32VB-NEXT:    slli t1, t1, 24
+; RV32VB-NEXT:    or a6, t1, a6
 ; RV32VB-NEXT:    slli a0, a0, 8
-; RV32VB-NEXT:    or a0, t2, a0
-; RV32VB-NEXT:    slli t3, t3, 16
-; RV32VB-NEXT:    slli a7, a7, 24
-; RV32VB-NEXT:    or a7, a7, t3
+; RV32VB-NEXT:    or a0, a7, a0
+; RV32VB-NEXT:    slli t2, t2, 16
+; RV32VB-NEXT:    slli t0, t0, 24
+; RV32VB-NEXT:    or a7, t0, t2
 ; RV32VB-NEXT:    or a1, a1, a3
-; RV32VB-NEXT:    or a3, a6, a5
-; RV32VB-NEXT:    or a2, a2, a4
+; RV32VB-NEXT:    or a2, a2, a5
+; RV32VB-NEXT:    or a3, a4, a6
 ; RV32VB-NEXT:    or a0, a0, a7
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a1
-; RV32VB-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a2
+; RV32VB-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-NEXT:    ret
 ;
@@ -1824,24 +1833,24 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ; RV32VB-PACK-NEXT:    lbu t0, 75(a0)
 ; RV32VB-PACK-NEXT:    lbu t1, 82(a0)
 ; RV32VB-PACK-NEXT:    packh a1, a1, a2
-; RV32VB-PACK-NEXT:    lbu a2, 154(a0)
-; RV32VB-PACK-NEXT:    lbu t2, 161(a0)
-; RV32VB-PACK-NEXT:    lbu t3, 163(a0)
-; RV32VB-PACK-NEXT:    packh a3, a3, a4
-; RV32VB-PACK-NEXT:    packh a4, a6, a7
+; RV32VB-PACK-NEXT:    packh a2, a3, a4
+; RV32VB-PACK-NEXT:    packh a3, a6, a7
+; RV32VB-PACK-NEXT:    lbu a4, 93(a0)
+; RV32VB-PACK-NEXT:    lbu a6, 105(a0)
+; RV32VB-PACK-NEXT:    lbu a7, 124(a0)
+; RV32VB-PACK-NEXT:    lbu t2, 144(a0)
 ; RV32VB-PACK-NEXT:    packh a5, a5, t0
-; RV32VB-PACK-NEXT:    lbu a6, 93(a0)
-; RV32VB-PACK-NEXT:    lbu a7, 105(a0)
-; RV32VB-PACK-NEXT:    lbu t0, 124(a0)
-; RV32VB-PACK-NEXT:    lbu a0, 144(a0)
-; RV32VB-PACK-NEXT:    packh a6, t1, a6
-; RV32VB-PACK-NEXT:    packh a7, a7, t2
-; RV32VB-PACK-NEXT:    packh t0, t0, t3
-; RV32VB-PACK-NEXT:    packh a0, a0, a2
-; RV32VB-PACK-NEXT:    pack a1, a1, a3
-; RV32VB-PACK-NEXT:    pack a2, a4, a5
-; RV32VB-PACK-NEXT:    pack a3, a6, a7
-; RV32VB-PACK-NEXT:    pack a0, t0, a0
+; RV32VB-PACK-NEXT:    packh a4, t1, a4
+; RV32VB-PACK-NEXT:    lbu t0, 154(a0)
+; RV32VB-PACK-NEXT:    lbu t1, 161(a0)
+; RV32VB-PACK-NEXT:    lbu a0, 163(a0)
+; RV32VB-PACK-NEXT:    packh a6, a6, t1
+; RV32VB-PACK-NEXT:    packh a0, a7, a0
+; RV32VB-PACK-NEXT:    packh a7, t2, t0
+; RV32VB-PACK-NEXT:    pack a1, a1, a2
+; RV32VB-PACK-NEXT:    pack a2, a3, a5
+; RV32VB-PACK-NEXT:    pack a3, a4, a6
+; RV32VB-PACK-NEXT:    pack a0, a0, a7
 ; RV32VB-PACK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a1
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a2
@@ -1867,15 +1876,13 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ; RV64V-ONLY-NEXT:    lbu t2, 154(a0)
 ; RV64V-ONLY-NEXT:    lbu t3, 161(a0)
 ; RV64V-ONLY-NEXT:    lbu t4, 163(a0)
-; RV64V-ONLY-NEXT:    li t5, 255
-; RV64V-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64V-ONLY-NEXT:    vmv.s.x v0, t5
 ; RV64V-ONLY-NEXT:    lbu t5, 93(a0)
 ; RV64V-ONLY-NEXT:    lbu t6, 105(a0)
 ; RV64V-ONLY-NEXT:    lbu s0, 124(a0)
 ; RV64V-ONLY-NEXT:    lbu a0, 144(a0)
-; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64V-ONLY-NEXT:    vmv.v.x v8, a1
+; RV64V-ONLY-NEXT:    li a1, 255
 ; RV64V-ONLY-NEXT:    vmv.v.x v9, t1
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t5
@@ -1891,6 +1898,9 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a0
 ; RV64V-ONLY-NEXT:    vslide1down.vx v10, v8, t0
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v9, t2
+; RV64V-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64V-ONLY-NEXT:    vmv.s.x v0, a1
+; RV64V-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV64V-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64V-ONLY-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; RV64V-ONLY-NEXT:    .cfi_restore s0
@@ -1900,98 +1910,90 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ;
 ; RVA22U64-LABEL: buildvec_v16i8_loads_gather:
 ; RVA22U64:       # %bb.0:
-; RVA22U64-NEXT:    lbu a1, 0(a0)
+; RVA22U64-NEXT:    lbu a7, 0(a0)
 ; RVA22U64-NEXT:    lbu a2, 1(a0)
 ; RVA22U64-NEXT:    lbu a3, 22(a0)
 ; RVA22U64-NEXT:    lbu a4, 31(a0)
 ; RVA22U64-NEXT:    lbu a6, 623(a0)
-; RVA22U64-NEXT:    lbu t0, 44(a0)
-; RVA22U64-NEXT:    lbu a7, 55(a0)
-; RVA22U64-NEXT:    lbu a5, 75(a0)
+; RVA22U64-NEXT:    lbu a5, 44(a0)
+; RVA22U64-NEXT:    lbu a1, 55(a0)
+; RVA22U64-NEXT:    lbu t0, 75(a0)
 ; RVA22U64-NEXT:    lbu t1, 82(a0)
 ; RVA22U64-NEXT:    slli a2, a2, 8
 ; RVA22U64-NEXT:    slli a3, a3, 16
 ; RVA22U64-NEXT:    slli a4, a4, 24
-; RVA22U64-NEXT:    or t2, a1, a2
+; RVA22U64-NEXT:    slli a5, a5, 32
+; RVA22U64-NEXT:    slli a1, a1, 40
+; RVA22U64-NEXT:    or a7, a7, a2
 ; RVA22U64-NEXT:    or t3, a4, a3
-; RVA22U64-NEXT:    lbu a2, 93(a0)
+; RVA22U64-NEXT:    or t2, a1, a5
+; RVA22U64-NEXT:    lbu a4, 93(a0)
 ; RVA22U64-NEXT:    lbu t4, 105(a0)
-; RVA22U64-NEXT:    lbu t6, 124(a0)
+; RVA22U64-NEXT:    lbu a2, 124(a0)
 ; RVA22U64-NEXT:    lbu t5, 144(a0)
-; RVA22U64-NEXT:    slli t0, t0, 32
-; RVA22U64-NEXT:    slli a7, a7, 40
 ; RVA22U64-NEXT:    slli a6, a6, 48
-; RVA22U64-NEXT:    slli a5, a5, 56
-; RVA22U64-NEXT:    slli a2, a2, 8
-; RVA22U64-NEXT:    or a7, a7, t0
-; RVA22U64-NEXT:    or a5, a5, a6
-; RVA22U64-NEXT:    lbu a3, 154(a0)
+; RVA22U64-NEXT:    slli t0, t0, 56
+; RVA22U64-NEXT:    slli a4, a4, 8
+; RVA22U64-NEXT:    or a3, t0, a6
+; RVA22U64-NEXT:    or a4, t1, a4
+; RVA22U64-NEXT:    lbu a5, 154(a0)
 ; RVA22U64-NEXT:    lbu a1, 161(a0)
-; RVA22U64-NEXT:    or a2, t1, a2
 ; RVA22U64-NEXT:    lbu a0, 163(a0)
 ; RVA22U64-NEXT:    slli t4, t4, 16
 ; RVA22U64-NEXT:    slli a1, a1, 24
 ; RVA22U64-NEXT:    or a1, a1, t4
-; RVA22U64-NEXT:    slli t6, t6, 32
+; RVA22U64-NEXT:    slli a2, a2, 32
 ; RVA22U64-NEXT:    slli a0, a0, 40
-; RVA22U64-NEXT:    or a0, a0, t6
+; RVA22U64-NEXT:    or a0, a0, a2
 ; RVA22U64-NEXT:    slli t5, t5, 48
-; RVA22U64-NEXT:    slli a3, a3, 56
-; RVA22U64-NEXT:    or a3, a3, t5
-; RVA22U64-NEXT:    or a4, t2, t3
-; RVA22U64-NEXT:    or a5, a5, a7
-; RVA22U64-NEXT:    or a1, a1, a2
-; RVA22U64-NEXT:    or a0, a0, a3
-; RVA22U64-NEXT:    or a4, a4, a5
+; RVA22U64-NEXT:    slli a5, a5, 56
+; RVA22U64-NEXT:    or a2, a5, t5
+; RVA22U64-NEXT:    or a5, a7, t3
+; RVA22U64-NEXT:    or a3, a3, t2
+; RVA22U64-NEXT:    or a1, a1, a4
+; RVA22U64-NEXT:    or a0, a0, a2
+; RVA22U64-NEXT:    or a3, a3, a5
 ; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-NEXT:    vmv.v.x v8, a4
+; RVA22U64-NEXT:    vmv.v.x v8, a3
 ; RVA22U64-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-NEXT:    ret
 ;
 ; RVA22U64-PACK-LABEL: buildvec_v16i8_loads_gather:
 ; RVA22U64-PACK:       # %bb.0:
-; RVA22U64-PACK-NEXT:    addi sp, sp, -16
-; RVA22U64-PACK-NEXT:    .cfi_def_cfa_offset 16
-; RVA22U64-PACK-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
-; RVA22U64-PACK-NEXT:    .cfi_offset s0, -8
-; RVA22U64-PACK-NEXT:    lbu a1, 0(a0)
-; RVA22U64-PACK-NEXT:    lbu a2, 1(a0)
-; RVA22U64-PACK-NEXT:    lbu a6, 22(a0)
-; RVA22U64-PACK-NEXT:    lbu a7, 31(a0)
-; RVA22U64-PACK-NEXT:    lbu t0, 623(a0)
-; RVA22U64-PACK-NEXT:    lbu t3, 44(a0)
-; RVA22U64-PACK-NEXT:    lbu t4, 55(a0)
-; RVA22U64-PACK-NEXT:    lbu t5, 75(a0)
-; RVA22U64-PACK-NEXT:    lbu t1, 82(a0)
-; RVA22U64-PACK-NEXT:    packh t2, a1, a2
-; RVA22U64-PACK-NEXT:    lbu t6, 154(a0)
-; RVA22U64-PACK-NEXT:    lbu s0, 161(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 163(a0)
-; RVA22U64-PACK-NEXT:    packh a6, a6, a7
-; RVA22U64-PACK-NEXT:    packh a7, t3, t4
-; RVA22U64-PACK-NEXT:    packh a2, t0, t5
+; RVA22U64-PACK-NEXT:    lbu a7, 0(a0)
+; RVA22U64-PACK-NEXT:    lbu t1, 1(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 22(a0)
+; RVA22U64-PACK-NEXT:    lbu a4, 31(a0)
+; RVA22U64-PACK-NEXT:    lbu a6, 623(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 44(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 55(a0)
+; RVA22U64-PACK-NEXT:    lbu t0, 75(a0)
+; RVA22U64-PACK-NEXT:    lbu t3, 82(a0)
+; RVA22U64-PACK-NEXT:    packh a7, a7, t1
+; RVA22U64-PACK-NEXT:    packh t2, a3, a4
+; RVA22U64-PACK-NEXT:    packh t1, a5, a1
 ; RVA22U64-PACK-NEXT:    lbu a4, 93(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 105(a0)
-; RVA22U64-PACK-NEXT:    lbu a1, 124(a0)
-; RVA22U64-PACK-NEXT:    lbu a0, 144(a0)
-; RVA22U64-PACK-NEXT:    packh a4, t1, a4
-; RVA22U64-PACK-NEXT:    packh a5, a5, s0
-; RVA22U64-PACK-NEXT:    packh a1, a1, a3
-; RVA22U64-PACK-NEXT:    packh a0, a0, t6
-; RVA22U64-PACK-NEXT:    packw a3, t2, a6
-; RVA22U64-PACK-NEXT:    packw a2, a7, a2
-; RVA22U64-PACK-NEXT:    packw a4, a4, a5
-; RVA22U64-PACK-NEXT:    packw a0, a1, a0
-; RVA22U64-PACK-NEXT:    pack a1, a3, a2
-; RVA22U64-PACK-NEXT:    pack a0, a4, a0
+; RVA22U64-PACK-NEXT:    lbu t4, 105(a0)
+; RVA22U64-PACK-NEXT:    lbu t5, 124(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 144(a0)
+; RVA22U64-PACK-NEXT:    packh a2, a6, t0
+; RVA22U64-PACK-NEXT:    packh a4, t3, a4
+; RVA22U64-PACK-NEXT:    lbu a5, 154(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 161(a0)
+; RVA22U64-PACK-NEXT:    lbu a0, 163(a0)
+; RVA22U64-PACK-NEXT:    packh a1, t4, a1
+; RVA22U64-PACK-NEXT:    packh a0, t5, a0
+; RVA22U64-PACK-NEXT:    packh a3, a3, a5
+; RVA22U64-PACK-NEXT:    packw a5, a7, t2
+; RVA22U64-PACK-NEXT:    packw a2, t1, a2
+; RVA22U64-PACK-NEXT:    packw a1, a4, a1
+; RVA22U64-PACK-NEXT:    packw a0, a0, a3
+; RVA22U64-PACK-NEXT:    pack a2, a5, a2
+; RVA22U64-PACK-NEXT:    pack a0, a1, a0
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-PACK-NEXT:    vmv.v.x v8, a1
+; RVA22U64-PACK-NEXT:    vmv.v.x v8, a2
 ; RVA22U64-PACK-NEXT:    vslide1down.vx v8, v8, a0
-; RVA22U64-PACK-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
-; RVA22U64-PACK-NEXT:    .cfi_restore s0
-; RVA22U64-PACK-NEXT:    addi sp, sp, 16
-; RVA22U64-PACK-NEXT:    .cfi_def_cfa_offset 0
 ; RVA22U64-PACK-NEXT:    ret
 ;
 ; RV64ZVE32-LABEL: buildvec_v16i8_loads_gather:
@@ -2012,15 +2014,13 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ; RV64ZVE32-NEXT:    lbu t2, 154(a0)
 ; RV64ZVE32-NEXT:    lbu t3, 161(a0)
 ; RV64ZVE32-NEXT:    lbu t4, 163(a0)
-; RV64ZVE32-NEXT:    li t5, 255
-; RV64ZVE32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32-NEXT:    vmv.s.x v0, t5
 ; RV64ZVE32-NEXT:    lbu t5, 93(a0)
 ; RV64ZVE32-NEXT:    lbu t6, 105(a0)
 ; RV64ZVE32-NEXT:    lbu s0, 124(a0)
 ; RV64ZVE32-NEXT:    lbu a0, 144(a0)
-; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64ZVE32-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32-NEXT:    li a1, 255
 ; RV64ZVE32-NEXT:    vmv.v.x v9, t1
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t5
@@ -2036,6 +2036,9 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a0
 ; RV64ZVE32-NEXT:    vslide1down.vx v10, v8, t0
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v9, t2
+; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v0, a1
+; RV64ZVE32-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV64ZVE32-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64ZVE32-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; RV64ZVE32-NEXT:    .cfi_restore s0
@@ -2118,28 +2121,28 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) {
 ;
 ; RV32VB-LABEL: buildvec_v16i8_undef_low_half:
 ; RV32VB:       # %bb.0:
-; RV32VB-NEXT:    lbu a1, 93(a0)
-; RV32VB-NEXT:    lbu a2, 82(a0)
+; RV32VB-NEXT:    lbu a1, 82(a0)
+; RV32VB-NEXT:    lbu a2, 93(a0)
 ; RV32VB-NEXT:    lbu a3, 105(a0)
 ; RV32VB-NEXT:    lbu a4, 124(a0)
-; RV32VB-NEXT:    slli a1, a1, 8
-; RV32VB-NEXT:    lbu a5, 144(a0)
-; RV32VB-NEXT:    lbu a6, 154(a0)
-; RV32VB-NEXT:    lbu a7, 161(a0)
-; RV32VB-NEXT:    or a1, a2, a1
+; RV32VB-NEXT:    slli a2, a2, 8
+; RV32VB-NEXT:    or a1, a1, a2
+; RV32VB-NEXT:    lbu a2, 144(a0)
+; RV32VB-NEXT:    lbu a5, 154(a0)
+; RV32VB-NEXT:    lbu a6, 161(a0)
 ; RV32VB-NEXT:    lbu a0, 163(a0)
 ; RV32VB-NEXT:    slli a3, a3, 16
-; RV32VB-NEXT:    slli a7, a7, 24
-; RV32VB-NEXT:    or a2, a7, a3
+; RV32VB-NEXT:    slli a6, a6, 24
+; RV32VB-NEXT:    or a3, a6, a3
 ; RV32VB-NEXT:    slli a0, a0, 8
 ; RV32VB-NEXT:    or a0, a4, a0
-; RV32VB-NEXT:    slli a5, a5, 16
-; RV32VB-NEXT:    slli a6, a6, 24
-; RV32VB-NEXT:    or a3, a6, a5
+; RV32VB-NEXT:    slli a2, a2, 16
+; RV32VB-NEXT:    slli a5, a5, 24
+; RV32VB-NEXT:    or a2, a5, a2
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-NEXT:    vmv.v.i v8, 0
-; RV32VB-NEXT:    or a1, a1, a2
-; RV32VB-NEXT:    or a0, a0, a3
+; RV32VB-NEXT:    or a1, a1, a3
+; RV32VB-NEXT:    or a0, a0, a2
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, zero
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a0
@@ -2151,21 +2154,21 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) {
 ; RV32VB-PACK-NEXT:    lbu a2, 93(a0)
 ; RV32VB-PACK-NEXT:    lbu a3, 105(a0)
 ; RV32VB-PACK-NEXT:    lbu a4, 124(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 161(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 163(a0)
-; RV32VB-PACK-NEXT:    lbu a7, 144(a0)
-; RV32VB-PACK-NEXT:    lbu a0, 154(a0)
 ; RV32VB-PACK-NEXT:    packh a1, a1, a2
-; RV32VB-PACK-NEXT:    packh a2, a3, a5
-; RV32VB-PACK-NEXT:    packh a3, a4, a6
-; RV32VB-PACK-NEXT:    packh a0, a7, a0
-; RV32VB-PACK-NEXT:    pack a1, a1, a2
-; RV32VB-PACK-NEXT:    packh a2, a0, a0
-; RV32VB-PACK-NEXT:    pack a2, a2, a2
+; RV32VB-PACK-NEXT:    lbu a2, 144(a0)
+; RV32VB-PACK-NEXT:    lbu a5, 154(a0)
+; RV32VB-PACK-NEXT:    lbu a6, 161(a0)
+; RV32VB-PACK-NEXT:    lbu a0, 163(a0)
+; RV32VB-PACK-NEXT:    packh a3, a3, a6
+; RV32VB-PACK-NEXT:    packh a0, a4, a0
+; RV32VB-PACK-NEXT:    packh a2, a2, a5
+; RV32VB-PACK-NEXT:    pack a1, a1, a3
+; RV32VB-PACK-NEXT:    packh a3, a0, a0
+; RV32VB-PACK-NEXT:    pack a3, a3, a3
 ; RV32VB-PACK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32VB-PACK-NEXT:    vmv.v.x v8, a2
-; RV32VB-PACK-NEXT:    pack a0, a3, a0
-; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a2
+; RV32VB-PACK-NEXT:    vmv.v.x v8, a3
+; RV32VB-PACK-NEXT:    pack a0, a0, a2
+; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    ret
@@ -2193,26 +2196,26 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) {
 ;
 ; RVA22U64-LABEL: buildvec_v16i8_undef_low_half:
 ; RVA22U64:       # %bb.0:
-; RVA22U64-NEXT:    lbu a1, 93(a0)
-; RVA22U64-NEXT:    lbu a6, 82(a0)
-; RVA22U64-NEXT:    lbu a7, 105(a0)
+; RVA22U64-NEXT:    lbu a1, 82(a0)
+; RVA22U64-NEXT:    lbu a2, 93(a0)
+; RVA22U64-NEXT:    lbu a3, 105(a0)
 ; RVA22U64-NEXT:    lbu a4, 124(a0)
-; RVA22U64-NEXT:    slli a1, a1, 8
-; RVA22U64-NEXT:    lbu a5, 144(a0)
-; RVA22U64-NEXT:    lbu a2, 154(a0)
-; RVA22U64-NEXT:    lbu a3, 161(a0)
-; RVA22U64-NEXT:    or a1, a6, a1
+; RVA22U64-NEXT:    slli a2, a2, 8
+; RVA22U64-NEXT:    or a6, a1, a2
+; RVA22U64-NEXT:    lbu a2, 144(a0)
+; RVA22U64-NEXT:    lbu a5, 154(a0)
+; RVA22U64-NEXT:    lbu a1, 161(a0)
 ; RVA22U64-NEXT:    lbu a0, 163(a0)
-; RVA22U64-NEXT:    slli a7, a7, 16
-; RVA22U64-NEXT:    slli a3, a3, 24
-; RVA22U64-NEXT:    or a3, a3, a7
+; RVA22U64-NEXT:    slli a3, a3, 16
+; RVA22U64-NEXT:    slli a1, a1, 24
+; RVA22U64-NEXT:    or a1, a1, a3
 ; RVA22U64-NEXT:    slli a4, a4, 32
 ; RVA22U64-NEXT:    slli a0, a0, 40
 ; RVA22U64-NEXT:    or a0, a0, a4
-; RVA22U64-NEXT:    slli a5, a5, 48
-; RVA22U64-NEXT:    slli a2, a2, 56
+; RVA22U64-NEXT:    slli a2, a2, 48
+; RVA22U64-NEXT:    slli a5, a5, 56
 ; RVA22U64-NEXT:    or a2, a2, a5
-; RVA22U64-NEXT:    or a1, a1, a3
+; RVA22U64-NEXT:    or a1, a6, a1
 ; RVA22U64-NEXT:    or a0, a0, a2
 ; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
@@ -2222,24 +2225,24 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) {
 ;
 ; RVA22U64-PACK-LABEL: buildvec_v16i8_undef_low_half:
 ; RVA22U64-PACK:       # %bb.0:
-; RVA22U64-PACK-NEXT:    lbu a6, 82(a0)
-; RVA22U64-PACK-NEXT:    lbu a7, 93(a0)
-; RVA22U64-PACK-NEXT:    lbu t0, 105(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 82(a0)
+; RVA22U64-PACK-NEXT:    lbu a2, 93(a0)
+; RVA22U64-PACK-NEXT:    lbu a6, 105(a0)
 ; RVA22U64-PACK-NEXT:    lbu a4, 124(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 161(a0)
-; RVA22U64-PACK-NEXT:    lbu a1, 163(a0)
+; RVA22U64-PACK-NEXT:    packh a1, a1, a2
 ; RVA22U64-PACK-NEXT:    lbu a2, 144(a0)
-; RVA22U64-PACK-NEXT:    lbu a0, 154(a0)
-; RVA22U64-PACK-NEXT:    packh a3, a6, a7
-; RVA22U64-PACK-NEXT:    packh a5, t0, a5
-; RVA22U64-PACK-NEXT:    packh a1, a4, a1
-; RVA22U64-PACK-NEXT:    packh a0, a2, a0
-; RVA22U64-PACK-NEXT:    packw a2, a3, a5
+; RVA22U64-PACK-NEXT:    lbu a5, 154(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 161(a0)
+; RVA22U64-PACK-NEXT:    lbu a0, 163(a0)
+; RVA22U64-PACK-NEXT:    packh a3, a6, a3
+; RVA22U64-PACK-NEXT:    packh a0, a4, a0
+; RVA22U64-PACK-NEXT:    packh a2, a2, a5
+; RVA22U64-PACK-NEXT:    packw a1, a1, a3
 ; RVA22U64-PACK-NEXT:    packh a3, a0, a0
 ; RVA22U64-PACK-NEXT:    packw a3, a3, a3
 ; RVA22U64-PACK-NEXT:    pack a3, a3, a3
-; RVA22U64-PACK-NEXT:    packw a0, a1, a0
-; RVA22U64-PACK-NEXT:    pack a0, a2, a0
+; RVA22U64-PACK-NEXT:    packw a0, a0, a2
+; RVA22U64-PACK-NEXT:    pack a0, a1, a0
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVA22U64-PACK-NEXT:    vmv.v.x v8, a3
 ; RVA22U64-PACK-NEXT:    vslide1down.vx v8, v8, a0
@@ -2319,26 +2322,26 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) {
 ;
 ; RV32VB-LABEL: buildvec_v16i8_undef_high_half:
 ; RV32VB:       # %bb.0:
-; RV32VB-NEXT:    lbu a1, 1(a0)
-; RV32VB-NEXT:    lbu a2, 22(a0)
-; RV32VB-NEXT:    lbu a3, 31(a0)
-; RV32VB-NEXT:    lbu a4, 0(a0)
-; RV32VB-NEXT:    slli a1, a1, 8
-; RV32VB-NEXT:    slli a2, a2, 16
-; RV32VB-NEXT:    slli a3, a3, 24
-; RV32VB-NEXT:    or a1, a4, a1
-; RV32VB-NEXT:    lbu a4, 44(a0)
-; RV32VB-NEXT:    lbu a5, 55(a0)
-; RV32VB-NEXT:    or a2, a3, a2
-; RV32VB-NEXT:    lbu a3, 623(a0)
-; RV32VB-NEXT:    lbu a0, 75(a0)
-; RV32VB-NEXT:    slli a5, a5, 8
-; RV32VB-NEXT:    or a4, a4, a5
+; RV32VB-NEXT:    lbu a1, 0(a0)
+; RV32VB-NEXT:    lbu a2, 1(a0)
+; RV32VB-NEXT:    lbu a3, 22(a0)
+; RV32VB-NEXT:    lbu a4, 31(a0)
+; RV32VB-NEXT:    slli a2, a2, 8
 ; RV32VB-NEXT:    slli a3, a3, 16
-; RV32VB-NEXT:    slli a0, a0, 24
-; RV32VB-NEXT:    or a0, a0, a3
+; RV32VB-NEXT:    slli a4, a4, 24
 ; RV32VB-NEXT:    or a1, a1, a2
-; RV32VB-NEXT:    or a0, a4, a0
+; RV32VB-NEXT:    or a3, a4, a3
+; RV32VB-NEXT:    lbu a2, 44(a0)
+; RV32VB-NEXT:    lbu a4, 55(a0)
+; RV32VB-NEXT:    lbu a5, 75(a0)
+; RV32VB-NEXT:    slli a4, a4, 8
+; RV32VB-NEXT:    or a2, a2, a4
+; RV32VB-NEXT:    lbu a0, 623(a0)
+; RV32VB-NEXT:    slli a0, a0, 16
+; RV32VB-NEXT:    slli a5, a5, 24
+; RV32VB-NEXT:    or a0, a5, a0
+; RV32VB-NEXT:    or a1, a1, a3
+; RV32VB-NEXT:    or a0, a2, a0
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a1
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a0
@@ -2352,14 +2355,14 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) {
 ; RV32VB-PACK-NEXT:    lbu a2, 1(a0)
 ; RV32VB-PACK-NEXT:    lbu a3, 22(a0)
 ; RV32VB-PACK-NEXT:    lbu a4, 31(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 623(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 44(a0)
-; RV32VB-PACK-NEXT:    lbu a7, 55(a0)
-; RV32VB-PACK-NEXT:    lbu a0, 75(a0)
 ; RV32VB-PACK-NEXT:    packh a1, a1, a2
 ; RV32VB-PACK-NEXT:    packh a2, a3, a4
-; RV32VB-PACK-NEXT:    packh a3, a6, a7
-; RV32VB-PACK-NEXT:    packh a0, a5, a0
+; RV32VB-PACK-NEXT:    lbu a3, 44(a0)
+; RV32VB-PACK-NEXT:    lbu a4, 55(a0)
+; RV32VB-PACK-NEXT:    lbu a5, 75(a0)
+; RV32VB-PACK-NEXT:    packh a3, a3, a4
+; RV32VB-PACK-NEXT:    lbu a0, 623(a0)
+; RV32VB-PACK-NEXT:    packh a0, a0, a5
 ; RV32VB-PACK-NEXT:    pack a1, a1, a2
 ; RV32VB-PACK-NEXT:    packh a2, a0, a0
 ; RV32VB-PACK-NEXT:    pack a0, a3, a0
@@ -2395,27 +2398,27 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) {
 ;
 ; RVA22U64-LABEL: buildvec_v16i8_undef_high_half:
 ; RVA22U64:       # %bb.0:
-; RVA22U64-NEXT:    lbu a1, 1(a0)
-; RVA22U64-NEXT:    lbu a2, 22(a0)
-; RVA22U64-NEXT:    lbu a3, 31(a0)
-; RVA22U64-NEXT:    lbu a4, 0(a0)
-; RVA22U64-NEXT:    slli a1, a1, 8
-; RVA22U64-NEXT:    slli a2, a2, 16
-; RVA22U64-NEXT:    slli a3, a3, 24
-; RVA22U64-NEXT:    or a1, a1, a4
-; RVA22U64-NEXT:    or a2, a2, a3
-; RVA22U64-NEXT:    lbu a3, 44(a0)
+; RVA22U64-NEXT:    lbu a1, 0(a0)
+; RVA22U64-NEXT:    lbu a2, 1(a0)
+; RVA22U64-NEXT:    lbu a3, 22(a0)
+; RVA22U64-NEXT:    lbu a4, 31(a0)
+; RVA22U64-NEXT:    slli a2, a2, 8
+; RVA22U64-NEXT:    slli a3, a3, 16
+; RVA22U64-NEXT:    slli a4, a4, 24
+; RVA22U64-NEXT:    or a1, a1, a2
+; RVA22U64-NEXT:    or a3, a3, a4
+; RVA22U64-NEXT:    lbu a2, 44(a0)
 ; RVA22U64-NEXT:    lbu a4, 55(a0)
-; RVA22U64-NEXT:    lbu a5, 623(a0)
-; RVA22U64-NEXT:    lbu a0, 75(a0)
-; RVA22U64-NEXT:    slli a3, a3, 32
+; RVA22U64-NEXT:    lbu a5, 75(a0)
+; RVA22U64-NEXT:    slli a2, a2, 32
 ; RVA22U64-NEXT:    slli a4, a4, 40
-; RVA22U64-NEXT:    or a3, a3, a4
-; RVA22U64-NEXT:    slli a5, a5, 48
-; RVA22U64-NEXT:    slli a0, a0, 56
+; RVA22U64-NEXT:    or a2, a2, a4
+; RVA22U64-NEXT:    lbu a0, 623(a0)
+; RVA22U64-NEXT:    slli a0, a0, 48
+; RVA22U64-NEXT:    slli a5, a5, 56
 ; RVA22U64-NEXT:    or a0, a0, a5
-; RVA22U64-NEXT:    or a1, a1, a2
-; RVA22U64-NEXT:    or a0, a0, a3
+; RVA22U64-NEXT:    or a1, a1, a3
+; RVA22U64-NEXT:    or a0, a0, a2
 ; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVA22U64-NEXT:    vmv.v.x v8, a0
@@ -2424,26 +2427,26 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) {
 ;
 ; RVA22U64-PACK-LABEL: buildvec_v16i8_undef_high_half:
 ; RVA22U64-PACK:       # %bb.0:
-; RVA22U64-PACK-NEXT:    lbu a6, 0(a0)
-; RVA22U64-PACK-NEXT:    lbu a7, 1(a0)
-; RVA22U64-PACK-NEXT:    lbu t0, 22(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 0(a0)
+; RVA22U64-PACK-NEXT:    lbu a2, 1(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 22(a0)
 ; RVA22U64-PACK-NEXT:    lbu a4, 31(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 623(a0)
-; RVA22U64-PACK-NEXT:    lbu a1, 44(a0)
-; RVA22U64-PACK-NEXT:    lbu a2, 55(a0)
-; RVA22U64-PACK-NEXT:    lbu a0, 75(a0)
-; RVA22U64-PACK-NEXT:    packh a3, a6, a7
-; RVA22U64-PACK-NEXT:    packh a4, t0, a4
 ; RVA22U64-PACK-NEXT:    packh a1, a1, a2
-; RVA22U64-PACK-NEXT:    packh a0, a5, a0
-; RVA22U64-PACK-NEXT:    packw a2, a3, a4
-; RVA22U64-PACK-NEXT:    packh a3, a0, a0
-; RVA22U64-PACK-NEXT:    packw a3, a3, a3
-; RVA22U64-PACK-NEXT:    packw a0, a1, a0
-; RVA22U64-PACK-NEXT:    pack a0, a2, a0
+; RVA22U64-PACK-NEXT:    packh a2, a3, a4
+; RVA22U64-PACK-NEXT:    lbu a3, 44(a0)
+; RVA22U64-PACK-NEXT:    lbu a4, 55(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 75(a0)
+; RVA22U64-PACK-NEXT:    packh a3, a3, a4
+; RVA22U64-PACK-NEXT:    lbu a0, 623(a0)
+; RVA22U64-PACK-NEXT:    packh a0, a0, a5
+; RVA22U64-PACK-NEXT:    packw a1, a1, a2
+; RVA22U64-PACK-NEXT:    packh a2, a0, a0
+; RVA22U64-PACK-NEXT:    packw a2, a2, a2
+; RVA22U64-PACK-NEXT:    packw a0, a3, a0
+; RVA22U64-PACK-NEXT:    pack a0, a1, a0
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVA22U64-PACK-NEXT:    vmv.v.x v8, a0
-; RVA22U64-PACK-NEXT:    pack a0, a3, a3
+; RVA22U64-PACK-NEXT:    pack a0, a2, a2
 ; RVA22U64-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-PACK-NEXT:    ret
 ;
@@ -2504,15 +2507,13 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) {
 ; RV32-ONLY-NEXT:    lbu a3, 44(a0)
 ; RV32-ONLY-NEXT:    lbu a4, 55(a0)
 ; RV32-ONLY-NEXT:    lbu a5, 75(a0)
-; RV32-ONLY-NEXT:    li a6, 255
-; RV32-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV32-ONLY-NEXT:    vmv.s.x v0, a6
 ; RV32-ONLY-NEXT:    lbu a6, 82(a0)
 ; RV32-ONLY-NEXT:    lbu a7, 93(a0)
 ; RV32-ONLY-NEXT:    lbu t0, 105(a0)
 ; RV32-ONLY-NEXT:    lbu a0, 161(a0)
-; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV32-ONLY-NEXT:    vmv.v.x v8, a2
+; RV32-ONLY-NEXT:    li a2, 255
 ; RV32-ONLY-NEXT:    vmv.v.x v9, a6
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a7
@@ -2522,35 +2523,38 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) {
 ; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a0
 ; RV32-ONLY-NEXT:    vslide1down.vx v10, v8, a5
 ; RV32-ONLY-NEXT:    vslidedown.vi v8, v9, 4
+; RV32-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32-ONLY-NEXT:    vmv.s.x v0, a2
+; RV32-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV32-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV32-ONLY-NEXT:    ret
 ;
 ; RV32VB-LABEL: buildvec_v16i8_undef_edges:
 ; RV32VB:       # %bb.0:
 ; RV32VB-NEXT:    lbu a1, 623(a0)
-; RV32VB-NEXT:    lbu a2, 55(a0)
-; RV32VB-NEXT:    lbu a3, 75(a0)
-; RV32VB-NEXT:    lbu a4, 31(a0)
-; RV32VB-NEXT:    lbu a5, 44(a0)
-; RV32VB-NEXT:    slli a2, a2, 8
+; RV32VB-NEXT:    lbu a2, 31(a0)
+; RV32VB-NEXT:    lbu a3, 44(a0)
+; RV32VB-NEXT:    lbu a4, 55(a0)
+; RV32VB-NEXT:    lbu a5, 75(a0)
+; RV32VB-NEXT:    slli a4, a4, 8
 ; RV32VB-NEXT:    slli a1, a1, 16
-; RV32VB-NEXT:    slli a3, a3, 24
-; RV32VB-NEXT:    or a2, a5, a2
-; RV32VB-NEXT:    lbu a5, 82(a0)
-; RV32VB-NEXT:    lbu a6, 93(a0)
-; RV32VB-NEXT:    or a1, a3, a1
-; RV32VB-NEXT:    lbu a3, 105(a0)
+; RV32VB-NEXT:    slli a5, a5, 24
+; RV32VB-NEXT:    or a3, a3, a4
+; RV32VB-NEXT:    or a1, a5, a1
+; RV32VB-NEXT:    lbu a4, 82(a0)
+; RV32VB-NEXT:    lbu a5, 93(a0)
+; RV32VB-NEXT:    lbu a6, 105(a0)
 ; RV32VB-NEXT:    lbu a0, 161(a0)
-; RV32VB-NEXT:    slli a6, a6, 8
-; RV32VB-NEXT:    or a5, a5, a6
-; RV32VB-NEXT:    slli a3, a3, 16
+; RV32VB-NEXT:    slli a5, a5, 8
+; RV32VB-NEXT:    or a4, a4, a5
+; RV32VB-NEXT:    slli a6, a6, 16
 ; RV32VB-NEXT:    slli a0, a0, 24
-; RV32VB-NEXT:    or a0, a0, a3
-; RV32VB-NEXT:    slli a4, a4, 24
-; RV32VB-NEXT:    or a1, a2, a1
-; RV32VB-NEXT:    or a0, a5, a0
+; RV32VB-NEXT:    or a0, a0, a6
+; RV32VB-NEXT:    slli a2, a2, 24
+; RV32VB-NEXT:    or a1, a3, a1
+; RV32VB-NEXT:    or a0, a4, a0
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32VB-NEXT:    vmv.v.x v8, a4
+; RV32VB-NEXT:    vmv.v.x v8, a2
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, zero
@@ -2563,14 +2567,14 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) {
 ; RV32VB-PACK-NEXT:    lbu a3, 44(a0)
 ; RV32VB-PACK-NEXT:    lbu a4, 55(a0)
 ; RV32VB-PACK-NEXT:    lbu a5, 75(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 82(a0)
-; RV32VB-PACK-NEXT:    lbu a7, 93(a0)
-; RV32VB-PACK-NEXT:    lbu t0, 105(a0)
-; RV32VB-PACK-NEXT:    lbu a0, 161(a0)
 ; RV32VB-PACK-NEXT:    packh a3, a3, a4
 ; RV32VB-PACK-NEXT:    packh a1, a1, a5
-; RV32VB-PACK-NEXT:    packh a4, a6, a7
-; RV32VB-PACK-NEXT:    packh a0, t0, a0
+; RV32VB-PACK-NEXT:    lbu a4, 82(a0)
+; RV32VB-PACK-NEXT:    lbu a5, 93(a0)
+; RV32VB-PACK-NEXT:    lbu a6, 105(a0)
+; RV32VB-PACK-NEXT:    lbu a0, 161(a0)
+; RV32VB-PACK-NEXT:    packh a4, a4, a5
+; RV32VB-PACK-NEXT:    packh a0, a6, a0
 ; RV32VB-PACK-NEXT:    packh a5, a0, a0
 ; RV32VB-PACK-NEXT:    packh a2, a0, a2
 ; RV32VB-PACK-NEXT:    pack a2, a5, a2
@@ -2591,15 +2595,13 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) {
 ; RV64V-ONLY-NEXT:    lbu a3, 44(a0)
 ; RV64V-ONLY-NEXT:    lbu a4, 55(a0)
 ; RV64V-ONLY-NEXT:    lbu a5, 75(a0)
-; RV64V-ONLY-NEXT:    li a6, 255
-; RV64V-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64V-ONLY-NEXT:    vmv.s.x v0, a6
 ; RV64V-ONLY-NEXT:    lbu a6, 82(a0)
 ; RV64V-ONLY-NEXT:    lbu a7, 93(a0)
 ; RV64V-ONLY-NEXT:    lbu t0, 105(a0)
 ; RV64V-ONLY-NEXT:    lbu a0, 161(a0)
-; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64V-ONLY-NEXT:    vmv.v.x v8, a2
+; RV64V-ONLY-NEXT:    li a2, 255
 ; RV64V-ONLY-NEXT:    vmv.v.x v9, a6
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a3
 ; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a7
@@ -2609,65 +2611,68 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) {
 ; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a0
 ; RV64V-ONLY-NEXT:    vslide1down.vx v10, v8, a5
 ; RV64V-ONLY-NEXT:    vslidedown.vi v8, v9, 4
+; RV64V-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64V-ONLY-NEXT:    vmv.s.x v0, a2
+; RV64V-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV64V-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64V-ONLY-NEXT:    ret
 ;
 ; RVA22U64-LABEL: buildvec_v16i8_undef_edges:
 ; RVA22U64:       # %bb.0:
+; RVA22U64-NEXT:    lbu a1, 623(a0)
 ; RVA22U64-NEXT:    lbu a6, 31(a0)
-; RVA22U64-NEXT:    lbu a2, 44(a0)
-; RVA22U64-NEXT:    lbu a3, 55(a0)
-; RVA22U64-NEXT:    lbu a4, 623(a0)
+; RVA22U64-NEXT:    lbu a3, 44(a0)
+; RVA22U64-NEXT:    lbu a4, 55(a0)
 ; RVA22U64-NEXT:    lbu a5, 75(a0)
-; RVA22U64-NEXT:    slli a2, a2, 32
-; RVA22U64-NEXT:    slli a3, a3, 40
-; RVA22U64-NEXT:    slli a4, a4, 48
+; RVA22U64-NEXT:    slli a3, a3, 32
+; RVA22U64-NEXT:    slli a4, a4, 40
+; RVA22U64-NEXT:    slli a1, a1, 48
 ; RVA22U64-NEXT:    slli a5, a5, 56
-; RVA22U64-NEXT:    or a2, a2, a3
-; RVA22U64-NEXT:    lbu a3, 82(a0)
-; RVA22U64-NEXT:    lbu a1, 93(a0)
-; RVA22U64-NEXT:    or a4, a4, a5
-; RVA22U64-NEXT:    lbu a5, 105(a0)
+; RVA22U64-NEXT:    or a3, a3, a4
+; RVA22U64-NEXT:    or a1, a1, a5
+; RVA22U64-NEXT:    lbu a4, 82(a0)
+; RVA22U64-NEXT:    lbu a5, 93(a0)
+; RVA22U64-NEXT:    lbu a2, 105(a0)
 ; RVA22U64-NEXT:    lbu a0, 161(a0)
-; RVA22U64-NEXT:    slli a1, a1, 8
-; RVA22U64-NEXT:    or a1, a1, a3
-; RVA22U64-NEXT:    slli a5, a5, 16
+; RVA22U64-NEXT:    slli a5, a5, 8
+; RVA22U64-NEXT:    or a4, a4, a5
+; RVA22U64-NEXT:    slli a2, a2, 16
 ; RVA22U64-NEXT:    slli a0, a0, 24
-; RVA22U64-NEXT:    or a0, a0, a5
+; RVA22U64-NEXT:    or a0, a0, a2
 ; RVA22U64-NEXT:    slli a6, a6, 24
-; RVA22U64-NEXT:    or a2, a2, a4
-; RVA22U64-NEXT:    add.uw a2, a6, a2
-; RVA22U64-NEXT:    or a0, a0, a1
+; RVA22U64-NEXT:    or a1, a1, a3
+; RVA22U64-NEXT:    add.uw a1, a6, a1
+; RVA22U64-NEXT:    or a0, a0, a4
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-NEXT:    vmv.v.x v8, a2
+; RVA22U64-NEXT:    vmv.v.x v8, a1
 ; RVA22U64-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-NEXT:    ret
 ;
 ; RVA22U64-PACK-LABEL: buildvec_v16i8_undef_edges:
 ; RVA22U64-PACK:       # %bb.0:
-; RVA22U64-PACK-NEXT:    lbu a7, 623(a0)
-; RVA22U64-PACK-NEXT:    lbu a6, 31(a0)
-; RVA22U64-PACK-NEXT:    lbu t0, 44(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 623(a0)
+; RVA22U64-PACK-NEXT:    lbu a2, 31(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 44(a0)
 ; RVA22U64-PACK-NEXT:    lbu a4, 55(a0)
 ; RVA22U64-PACK-NEXT:    lbu a5, 75(a0)
-; RVA22U64-PACK-NEXT:    lbu a2, 82(a0)
-; RVA22U64-PACK-NEXT:    lbu a1, 93(a0)
+; RVA22U64-PACK-NEXT:    packh a6, a3, a4
+; RVA22U64-PACK-NEXT:    packh a1, a1, a5
+; RVA22U64-PACK-NEXT:    lbu a4, 82(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 93(a0)
 ; RVA22U64-PACK-NEXT:    lbu a3, 105(a0)
 ; RVA22U64-PACK-NEXT:    lbu a0, 161(a0)
-; RVA22U64-PACK-NEXT:    packh a4, t0, a4
-; RVA22U64-PACK-NEXT:    packh a5, a7, a5
-; RVA22U64-PACK-NEXT:    packh a1, a2, a1
+; RVA22U64-PACK-NEXT:    packh a4, a4, a5
 ; RVA22U64-PACK-NEXT:    packh a0, a3, a0
-; RVA22U64-PACK-NEXT:    packh a2, a0, a0
-; RVA22U64-PACK-NEXT:    packh a3, a0, a6
-; RVA22U64-PACK-NEXT:    packw a3, a2, a3
-; RVA22U64-PACK-NEXT:    packw a2, a2, a2
-; RVA22U64-PACK-NEXT:    packw a4, a4, a5
-; RVA22U64-PACK-NEXT:    packw a0, a1, a0
-; RVA22U64-PACK-NEXT:    pack a1, a3, a4
+; RVA22U64-PACK-NEXT:    packh a3, a0, a0
+; RVA22U64-PACK-NEXT:    packh a2, a0, a2
+; RVA22U64-PACK-NEXT:    packw a2, a3, a2
+; RVA22U64-PACK-NEXT:    packw a3, a3, a3
+; RVA22U64-PACK-NEXT:    packw a1, a6, a1
+; RVA22U64-PACK-NEXT:    packw a0, a4, a0
+; RVA22U64-PACK-NEXT:    pack a1, a2, a1
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVA22U64-PACK-NEXT:    vmv.v.x v8, a1
-; RVA22U64-PACK-NEXT:    pack a0, a0, a2
+; RVA22U64-PACK-NEXT:    pack a0, a0, a3
 ; RVA22U64-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-PACK-NEXT:    ret
 ;
@@ -2678,15 +2683,13 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) {
 ; RV64ZVE32-NEXT:    lbu a3, 44(a0)
 ; RV64ZVE32-NEXT:    lbu a4, 55(a0)
 ; RV64ZVE32-NEXT:    lbu a5, 75(a0)
-; RV64ZVE32-NEXT:    li a6, 255
-; RV64ZVE32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32-NEXT:    vmv.s.x v0, a6
 ; RV64ZVE32-NEXT:    lbu a6, 82(a0)
 ; RV64ZVE32-NEXT:    lbu a7, 93(a0)
 ; RV64ZVE32-NEXT:    lbu t0, 105(a0)
 ; RV64ZVE32-NEXT:    lbu a0, 161(a0)
-; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64ZVE32-NEXT:    vmv.v.x v8, a2
+; RV64ZVE32-NEXT:    li a2, 255
 ; RV64ZVE32-NEXT:    vmv.v.x v9, a6
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a3
 ; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a7
@@ -2696,6 +2699,9 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) {
 ; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a0
 ; RV64ZVE32-NEXT:    vslide1down.vx v10, v8, a5
 ; RV64ZVE32-NEXT:    vslidedown.vi v8, v9, 4
+; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v0, a2
+; RV64ZVE32-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV64ZVE32-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64ZVE32-NEXT:    ret
   %p4 = getelementptr i8, ptr %p, i32 31
@@ -2741,13 +2747,11 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RV32-ONLY-NEXT:    lbu a6, 82(a0)
 ; RV32-ONLY-NEXT:    lbu a7, 93(a0)
 ; RV32-ONLY-NEXT:    lbu t0, 124(a0)
-; RV32-ONLY-NEXT:    li t1, 255
-; RV32-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV32-ONLY-NEXT:    vmv.s.x v0, t1
 ; RV32-ONLY-NEXT:    lbu t1, 144(a0)
 ; RV32-ONLY-NEXT:    lbu a0, 154(a0)
-; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV32-ONLY-NEXT:    vmv.v.x v8, a1
+; RV32-ONLY-NEXT:    li a1, 255
 ; RV32-ONLY-NEXT:    vmv.v.x v9, a6
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a7
@@ -2761,37 +2765,40 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t1
 ; RV32-ONLY-NEXT:    vslide1down.vx v10, v8, a5
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v9, a0
+; RV32-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32-ONLY-NEXT:    vmv.s.x v0, a1
+; RV32-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV32-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV32-ONLY-NEXT:    ret
 ;
 ; RV32VB-LABEL: buildvec_v16i8_loads_undef_scattered:
 ; RV32VB:       # %bb.0:
-; RV32VB-NEXT:    lbu a1, 1(a0)
-; RV32VB-NEXT:    lbu a2, 0(a0)
+; RV32VB-NEXT:    lbu a1, 0(a0)
+; RV32VB-NEXT:    lbu a2, 1(a0)
 ; RV32VB-NEXT:    lbu a3, 44(a0)
 ; RV32VB-NEXT:    lbu a4, 55(a0)
-; RV32VB-NEXT:    slli a1, a1, 8
-; RV32VB-NEXT:    or a1, a2, a1
-; RV32VB-NEXT:    lbu a2, 75(a0)
-; RV32VB-NEXT:    lbu a5, 82(a0)
-; RV32VB-NEXT:    lbu a6, 93(a0)
-; RV32VB-NEXT:    lbu a7, 124(a0)
+; RV32VB-NEXT:    slli a2, a2, 8
 ; RV32VB-NEXT:    slli a4, a4, 8
+; RV32VB-NEXT:    or a1, a1, a2
 ; RV32VB-NEXT:    or a3, a3, a4
-; RV32VB-NEXT:    lbu a4, 144(a0)
+; RV32VB-NEXT:    lbu a2, 75(a0)
+; RV32VB-NEXT:    lbu a4, 82(a0)
+; RV32VB-NEXT:    lbu a5, 93(a0)
+; RV32VB-NEXT:    lbu a6, 124(a0)
+; RV32VB-NEXT:    slli a5, a5, 8
+; RV32VB-NEXT:    or a4, a4, a5
+; RV32VB-NEXT:    lbu a5, 144(a0)
 ; RV32VB-NEXT:    lbu a0, 154(a0)
-; RV32VB-NEXT:    slli a6, a6, 8
-; RV32VB-NEXT:    or a5, a5, a6
-; RV32VB-NEXT:    slli a4, a4, 16
+; RV32VB-NEXT:    slli a5, a5, 16
 ; RV32VB-NEXT:    slli a0, a0, 24
-; RV32VB-NEXT:    or a0, a0, a4
+; RV32VB-NEXT:    or a0, a0, a5
 ; RV32VB-NEXT:    slli a2, a2, 24
 ; RV32VB-NEXT:    or a2, a3, a2
-; RV32VB-NEXT:    or a0, a7, a0
+; RV32VB-NEXT:    or a0, a6, a0
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a1
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a2
-; RV32VB-NEXT:    vslide1down.vx v8, v8, a5
+; RV32VB-NEXT:    vslide1down.vx v8, v8, a4
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-NEXT:    ret
 ;
@@ -2801,26 +2808,26 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RV32VB-PACK-NEXT:    lbu a2, 1(a0)
 ; RV32VB-PACK-NEXT:    lbu a3, 44(a0)
 ; RV32VB-PACK-NEXT:    lbu a4, 55(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 75(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 82(a0)
-; RV32VB-PACK-NEXT:    lbu a7, 93(a0)
 ; RV32VB-PACK-NEXT:    packh a1, a1, a2
-; RV32VB-PACK-NEXT:    lbu a2, 144(a0)
-; RV32VB-PACK-NEXT:    lbu t0, 154(a0)
-; RV32VB-PACK-NEXT:    packh a3, a3, a4
-; RV32VB-PACK-NEXT:    lbu a0, 124(a0)
-; RV32VB-PACK-NEXT:    packh a4, a6, a7
-; RV32VB-PACK-NEXT:    packh a2, a2, t0
-; RV32VB-PACK-NEXT:    packh a5, a0, a5
-; RV32VB-PACK-NEXT:    pack a3, a3, a5
-; RV32VB-PACK-NEXT:    packh a5, a0, a0
-; RV32VB-PACK-NEXT:    packh a0, a0, a0
-; RV32VB-PACK-NEXT:    pack a0, a0, a2
-; RV32VB-PACK-NEXT:    pack a1, a1, a5
+; RV32VB-PACK-NEXT:    packh a2, a3, a4
+; RV32VB-PACK-NEXT:    lbu a3, 75(a0)
+; RV32VB-PACK-NEXT:    lbu a4, 82(a0)
+; RV32VB-PACK-NEXT:    lbu a5, 93(a0)
+; RV32VB-PACK-NEXT:    lbu a6, 124(a0)
+; RV32VB-PACK-NEXT:    packh a4, a4, a5
+; RV32VB-PACK-NEXT:    lbu a5, 144(a0)
+; RV32VB-PACK-NEXT:    lbu a0, 154(a0)
+; RV32VB-PACK-NEXT:    packh a0, a5, a0
+; RV32VB-PACK-NEXT:    packh a3, a0, a3
+; RV32VB-PACK-NEXT:    pack a2, a2, a3
+; RV32VB-PACK-NEXT:    packh a3, a0, a0
+; RV32VB-PACK-NEXT:    packh a5, a6, a0
+; RV32VB-PACK-NEXT:    pack a0, a5, a0
+; RV32VB-PACK-NEXT:    pack a1, a1, a3
 ; RV32VB-PACK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a1
-; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a3
-; RV32VB-PACK-NEXT:    pack a1, a4, a5
+; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a2
+; RV32VB-PACK-NEXT:    pack a1, a4, a3
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    ret
@@ -2835,13 +2842,11 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RV64V-ONLY-NEXT:    lbu a6, 82(a0)
 ; RV64V-ONLY-NEXT:    lbu a7, 93(a0)
 ; RV64V-ONLY-NEXT:    lbu t0, 124(a0)
-; RV64V-ONLY-NEXT:    li t1, 255
-; RV64V-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64V-ONLY-NEXT:    vmv.s.x v0, t1
 ; RV64V-ONLY-NEXT:    lbu t1, 144(a0)
 ; RV64V-ONLY-NEXT:    lbu a0, 154(a0)
-; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64V-ONLY-NEXT:    vmv.v.x v8, a1
+; RV64V-ONLY-NEXT:    li a1, 255
 ; RV64V-ONLY-NEXT:    vmv.v.x v9, a6
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a7
@@ -2855,39 +2860,42 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t1
 ; RV64V-ONLY-NEXT:    vslide1down.vx v10, v8, a5
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v9, a0
+; RV64V-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64V-ONLY-NEXT:    vmv.s.x v0, a1
+; RV64V-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV64V-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64V-ONLY-NEXT:    ret
 ;
 ; RVA22U64-LABEL: buildvec_v16i8_loads_undef_scattered:
 ; RVA22U64:       # %bb.0:
-; RVA22U64-NEXT:    lbu a1, 1(a0)
-; RVA22U64-NEXT:    lbu a2, 0(a0)
+; RVA22U64-NEXT:    lbu a1, 0(a0)
+; RVA22U64-NEXT:    lbu a2, 1(a0)
 ; RVA22U64-NEXT:    lbu a3, 44(a0)
 ; RVA22U64-NEXT:    lbu a4, 55(a0)
-; RVA22U64-NEXT:    slli a1, a1, 8
-; RVA22U64-NEXT:    or a6, a2, a1
-; RVA22U64-NEXT:    lbu a7, 75(a0)
-; RVA22U64-NEXT:    lbu a5, 82(a0)
-; RVA22U64-NEXT:    lbu a1, 93(a0)
-; RVA22U64-NEXT:    lbu a2, 124(a0)
+; RVA22U64-NEXT:    slli a2, a2, 8
 ; RVA22U64-NEXT:    slli a3, a3, 32
 ; RVA22U64-NEXT:    slli a4, a4, 40
+; RVA22U64-NEXT:    or a6, a1, a2
 ; RVA22U64-NEXT:    or a3, a3, a4
-; RVA22U64-NEXT:    lbu a4, 144(a0)
+; RVA22U64-NEXT:    lbu a2, 75(a0)
+; RVA22U64-NEXT:    lbu a4, 82(a0)
+; RVA22U64-NEXT:    lbu a5, 93(a0)
+; RVA22U64-NEXT:    lbu a1, 124(a0)
+; RVA22U64-NEXT:    slli a5, a5, 8
+; RVA22U64-NEXT:    or a4, a4, a5
+; RVA22U64-NEXT:    lbu a5, 144(a0)
 ; RVA22U64-NEXT:    lbu a0, 154(a0)
-; RVA22U64-NEXT:    slli a1, a1, 8
-; RVA22U64-NEXT:    or a1, a1, a5
-; RVA22U64-NEXT:    slli a4, a4, 48
+; RVA22U64-NEXT:    slli a5, a5, 48
 ; RVA22U64-NEXT:    slli a0, a0, 56
-; RVA22U64-NEXT:    or a0, a0, a4
-; RVA22U64-NEXT:    slli a7, a7, 56
-; RVA22U64-NEXT:    or a3, a7, a3
-; RVA22U64-NEXT:    slli a2, a2, 32
-; RVA22U64-NEXT:    or a0, a0, a2
-; RVA22U64-NEXT:    or a2, a6, a3
+; RVA22U64-NEXT:    or a0, a0, a5
+; RVA22U64-NEXT:    slli a2, a2, 56
+; RVA22U64-NEXT:    or a2, a2, a3
+; RVA22U64-NEXT:    slli a1, a1, 32
 ; RVA22U64-NEXT:    or a0, a0, a1
+; RVA22U64-NEXT:    or a1, a6, a2
+; RVA22U64-NEXT:    or a0, a0, a4
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-NEXT:    vmv.v.x v8, a2
+; RVA22U64-NEXT:    vmv.v.x v8, a1
 ; RVA22U64-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-NEXT:    ret
 ;
@@ -2895,29 +2903,29 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RVA22U64-PACK:       # %bb.0:
 ; RVA22U64-PACK-NEXT:    lbu a1, 0(a0)
 ; RVA22U64-PACK-NEXT:    lbu a2, 1(a0)
-; RVA22U64-PACK-NEXT:    lbu a7, 44(a0)
-; RVA22U64-PACK-NEXT:    lbu t0, 55(a0)
-; RVA22U64-PACK-NEXT:    lbu a6, 75(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 82(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 93(a0)
-; RVA22U64-PACK-NEXT:    packh t1, a1, a2
-; RVA22U64-PACK-NEXT:    lbu a2, 144(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 154(a0)
-; RVA22U64-PACK-NEXT:    packh a1, a7, t0
-; RVA22U64-PACK-NEXT:    lbu a0, 124(a0)
-; RVA22U64-PACK-NEXT:    packh a3, a5, a3
-; RVA22U64-PACK-NEXT:    packh a2, a2, a4
-; RVA22U64-PACK-NEXT:    packh a4, a0, a6
-; RVA22U64-PACK-NEXT:    packw a1, a1, a4
-; RVA22U64-PACK-NEXT:    packh a4, a0, a0
-; RVA22U64-PACK-NEXT:    packh a0, a0, a0
-; RVA22U64-PACK-NEXT:    packw a5, t1, a4
-; RVA22U64-PACK-NEXT:    packw a0, a0, a2
-; RVA22U64-PACK-NEXT:    packw a2, a3, a4
-; RVA22U64-PACK-NEXT:    pack a1, a5, a1
-; RVA22U64-PACK-NEXT:    pack a0, a2, a0
+; RVA22U64-PACK-NEXT:    lbu a3, 44(a0)
+; RVA22U64-PACK-NEXT:    lbu a4, 55(a0)
+; RVA22U64-PACK-NEXT:    packh a6, a1, a2
+; RVA22U64-PACK-NEXT:    packh a2, a3, a4
+; RVA22U64-PACK-NEXT:    lbu a3, 75(a0)
+; RVA22U64-PACK-NEXT:    lbu a4, 82(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 93(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 124(a0)
+; RVA22U64-PACK-NEXT:    packh a4, a4, a5
+; RVA22U64-PACK-NEXT:    lbu a5, 144(a0)
+; RVA22U64-PACK-NEXT:    lbu a0, 154(a0)
+; RVA22U64-PACK-NEXT:    packh a0, a5, a0
+; RVA22U64-PACK-NEXT:    packh a3, a0, a3
+; RVA22U64-PACK-NEXT:    packw a2, a2, a3
+; RVA22U64-PACK-NEXT:    packh a3, a0, a0
+; RVA22U64-PACK-NEXT:    packh a1, a1, a0
+; RVA22U64-PACK-NEXT:    packw a5, a6, a3
+; RVA22U64-PACK-NEXT:    packw a0, a1, a0
+; RVA22U64-PACK-NEXT:    packw a1, a4, a3
+; RVA22U64-PACK-NEXT:    pack a2, a5, a2
+; RVA22U64-PACK-NEXT:    pack a0, a1, a0
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-PACK-NEXT:    vmv.v.x v8, a1
+; RVA22U64-PACK-NEXT:    vmv.v.x v8, a2
 ; RVA22U64-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-PACK-NEXT:    ret
 ;
@@ -2931,13 +2939,11 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RV64ZVE32-NEXT:    lbu a6, 82(a0)
 ; RV64ZVE32-NEXT:    lbu a7, 93(a0)
 ; RV64ZVE32-NEXT:    lbu t0, 124(a0)
-; RV64ZVE32-NEXT:    li t1, 255
-; RV64ZVE32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32-NEXT:    vmv.s.x v0, t1
 ; RV64ZVE32-NEXT:    lbu t1, 144(a0)
 ; RV64ZVE32-NEXT:    lbu a0, 154(a0)
-; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64ZVE32-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32-NEXT:    li a1, 255
 ; RV64ZVE32-NEXT:    vmv.v.x v9, a6
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a7
@@ -2951,6 +2957,9 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t1
 ; RV64ZVE32-NEXT:    vslide1down.vx v10, v8, a5
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v0, a1
+; RV64ZVE32-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV64ZVE32-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64ZVE32-NEXT:    ret
   %p2 = getelementptr i8, ptr %p, i32 1
@@ -3011,13 +3020,13 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 %
 ; RV32-ONLY-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV32-ONLY-NEXT:    vmv.v.x v8, a0
 ; RV32-ONLY-NEXT:    vmv.v.x v9, a4
-; RV32-ONLY-NEXT:    vmv.v.i v0, 15
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a5
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a6
 ; RV32-ONLY-NEXT:    vslide1down.vx v10, v8, a3
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v9, a7
+; RV32-ONLY-NEXT:    vmv.v.i v0, 15
 ; RV32-ONLY-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV32-ONLY-NEXT:    ret
 ;
@@ -3064,13 +3073,13 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 %
 ; RV64V-ONLY-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV64V-ONLY-NEXT:    vmv.v.x v8, a0
 ; RV64V-ONLY-NEXT:    vmv.v.x v9, a4
-; RV64V-ONLY-NEXT:    vmv.v.i v0, 15
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a5
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a6
 ; RV64V-ONLY-NEXT:    vslide1down.vx v10, v8, a3
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v9, a7
+; RV64V-ONLY-NEXT:    vmv.v.i v0, 15
 ; RV64V-ONLY-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64V-ONLY-NEXT:    ret
 ;
@@ -3119,13 +3128,13 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 %
 ; RV64ZVE32-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV64ZVE32-NEXT:    vmv.v.x v8, a0
 ; RV64ZVE32-NEXT:    vmv.v.x v9, a4
-; RV64ZVE32-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a5
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a6
 ; RV64ZVE32-NEXT:    vslide1down.vx v10, v8, a3
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v9, a7
+; RV64ZVE32-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32-NEXT:    ret
   %v1 = insertelement <8 x i8> poison, i8 %e1, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
index da7cdf3ba8ec0..b4e2c6617489c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -67,9 +67,9 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; RV32-V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
 ; RV32-V512-NEXT:    vid.v v10
 ; RV32-V512-NEXT:    vsrl.vi v11, v10, 1
-; RV32-V512-NEXT:    vmv.v.i v0, 10
 ; RV32-V512-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; RV32-V512-NEXT:    vrgatherei16.vv v10, v8, v11
+; RV32-V512-NEXT:    vmv.v.i v0, 10
 ; RV32-V512-NEXT:    vrgatherei16.vv v10, v9, v11, v0.t
 ; RV32-V512-NEXT:    vmv.v.v v8, v10
 ; RV32-V512-NEXT:    ret
@@ -79,8 +79,8 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
 ; RV64-V512-NEXT:    vid.v v10
 ; RV64-V512-NEXT:    vsrl.vi v11, v10, 1
-; RV64-V512-NEXT:    vmv.v.i v0, 10
 ; RV64-V512-NEXT:    vrgather.vv v10, v8, v11
+; RV64-V512-NEXT:    vmv.v.i v0, 10
 ; RV64-V512-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; RV64-V512-NEXT:    vmv.v.v v8, v10
 ; RV64-V512-NEXT:    ret
@@ -416,8 +416,8 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
 ; V128-NEXT:    vzext.vf2 v8, v24
 ; V128-NEXT:    addi a1, a1, -1366
 ; V128-NEXT:    vzext.vf2 v24, v0
-; V128-NEXT:    vmv.s.x v0, a1
 ; V128-NEXT:    vsll.vx v8, v8, a0
+; V128-NEXT:    vmv.s.x v0, a1
 ; V128-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; V128-NEXT:    vmerge.vvm v24, v24, v8, v0
 ; V128-NEXT:    addi a0, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index a5039c58fccb1..f9cc838869b8f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -86,8 +86,8 @@ define <4 x i16> @vrgather_shuffle_vv_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI6_0)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vle16.v v11, (a0)
-; CHECK-NEXT:    vmv.v.i v0, 8
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
+; CHECK-NEXT:    vmv.v.i v0, 8
 ; CHECK-NEXT:    vrgather.vi v10, v9, 1, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
@@ -155,20 +155,18 @@ define <8 x i64> @vrgather_permute_shuffle_uv_v8i64(<8 x i64> %x) {
 define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) {
 ; RV32-LABEL: vrgather_shuffle_vv_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT:    vmv.v.i v16, 2
+; RV32-NEXT:    li a0, 5
+; RV32-NEXT:    vslide1down.vx v20, v16, a0
 ; RV32-NEXT:    lui a0, %hi(.LCPI11_0)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI11_0)
-; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT:    vle16.v v20, (a0)
-; RV32-NEXT:    vmv.v.i v21, 2
+; RV32-NEXT:    vle16.v v21, (a0)
 ; RV32-NEXT:    li a0, 164
-; RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; RV32-NEXT:    vrgatherei16.vv v16, v8, v20
-; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    li a0, 5
-; RV32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32-NEXT:    vslide1down.vx v8, v21, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v16, v12, v8, v0.t
+; RV32-NEXT:    vrgatherei16.vv v16, v8, v21
+; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vrgatherei16.vv v16, v12, v20, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v16
 ; RV32-NEXT:    ret
 ;
@@ -211,8 +209,8 @@ define <8 x i64> @vrgather_shuffle_xv_v8i64(<8 x i64> %x) {
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI12_1)
 ; RV32-NEXT:    vle16.v v21, (a0)
 ; RV32-NEXT:    li a0, 113
-; RV32-NEXT:    vmv.s.x v0, a0
 ; RV32-NEXT:    vrgatherei16.vv v12, v16, v20
+; RV32-NEXT:    vmv.s.x v0, a0
 ; RV32-NEXT:    vrgatherei16.vv v12, v8, v21, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
 ; RV32-NEXT:    ret
@@ -365,10 +363,10 @@ define <8 x i8> @splat_ve4_ins_i1ve3(<8 x i8> %v) {
 define <8 x i8> @splat_ve2_we0(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: splat_ve2_we0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 66
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vrgather.vi v10, v8, 2
+; CHECK-NEXT:    li a0, 66
+; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vrgather.vi v10, v9, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
@@ -385,9 +383,9 @@ define <8 x i8> @splat_ve2_we0_ins_i0ve4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, tu, ma
 ; CHECK-NEXT:    vmv.s.x v11, a0
 ; CHECK-NEXT:    li a0, 66
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
+; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vrgather.vi v10, v9, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
@@ -400,10 +398,10 @@ define <8 x i8> @splat_ve2_we0_ins_i0we4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vrgather.vi v10, v8, 2
-; CHECK-NEXT:    li a0, 67
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 4
+; CHECK-NEXT:    li a0, 67
+; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
@@ -420,9 +418,9 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v11, a0
 ; CHECK-NEXT:    li a0, 66
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
+; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vrgather.vi v10, v9, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
@@ -434,16 +432,16 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: splat_ve2_we0_ins_i2we4:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 3, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 4
+; CHECK-NEXT:    vmv.v.i v11, 4
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
+; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vrgather.vi v10, v8, 2
 ; CHECK-NEXT:    li a0, 70
 ; CHECK-NEXT:    vsetivli zero, 3, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v11, v10, 2
+; CHECK-NEXT:    vslideup.vi v12, v11, 2
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vi v10, v8, 2
-; CHECK-NEXT:    vrgather.vv v10, v9, v11, v0.t
+; CHECK-NEXT:    vrgather.vv v10, v9, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
   %shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 12, i32 2, i32 2, i32 2, i32 8, i32 2>
@@ -453,13 +451,13 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) {
 define <8 x i8> @splat_ve2_we0_ins_i2ve4_i5we6(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: splat_ve2_we0_ins_i2ve4_i5we6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI26_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI26_0)
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vle8.v v10, (a0)
 ; CHECK-NEXT:    li a0, 20
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    lui a0, %hi(.LCPI26_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI26_0)
 ; CHECK-NEXT:    vmerge.vvm v9, v9, v8, v0
+; CHECK-NEXT:    vle8.v v10, (a0)
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10
 ; CHECK-NEXT:    ret
   %shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 2, i32 2, i32 14, i32 8, i32 2>
@@ -670,10 +668,10 @@ define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) {
 define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_non_contiguous_slideup_slidedown:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, -22
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 2
+; CHECK-NEXT:    li a0, -22
+; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 10, i32 6, i32 12, i32 13, i32 14>
@@ -684,13 +682,13 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w
 define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: unmergable:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI46_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI46_0)
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vle8.v v10, (a0)
 ; CHECK-NEXT:    li a0, 84
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    lui a0, %hi(.LCPI46_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI46_0)
 ; CHECK-NEXT:    vmerge.vvm v9, v9, v8, v0
+; CHECK-NEXT:    vle8.v v10, (a0)
 ; CHECK-NEXT:    vrgather.vv v8, v9, v10
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 9, i32 4, i32 11, i32 6, i32 13, i32 8, i32 15>
@@ -977,12 +975,11 @@ define <8 x i32> @shuffle_repeat3_singlesrc_e32(<8 x i32> %v) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v0, 7
-; CHECK-NEXT:    vmv.v.i v11, 1
+; CHECK-NEXT:    vmv.v.i v10, 1
 ; CHECK-NEXT:    li a0, 192
-; CHECK-NEXT:    vmv.s.x v10, a0
-; CHECK-NEXT:    vmerge.vim v11, v11, 0, v0
-; CHECK-NEXT:    vmv.v.v v0, v10
-; CHECK-NEXT:    vmerge.vim v12, v11, 2, v0
+; CHECK-NEXT:    vmerge.vim v10, v10, 0, v0
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmerge.vim v12, v10, 2, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
@@ -1060,18 +1057,18 @@ define <16 x i64> @shuffle_zipodd_v16i64(<16 x i64> %v1, <16 x i64> %v2) {
 define <16 x i32> @shuffle_disjoint_lanes(<16 x i32> %v, <16 x i32> %w) {
 ; CHECK-LABEL: shuffle_disjoint_lanes:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI74_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI74_0)
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vle8.v v16, (a0)
 ; CHECK-NEXT:    lui a0, 11
 ; CHECK-NEXT:    addi a0, a0, -1366
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    lui a0, %hi(.LCPI74_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI74_0)
 ; CHECK-NEXT:    vmerge.vvm v12, v12, v8, v0
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v18, v16
+; CHECK-NEXT:    vsext.vf2 v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v8, v12, v18
+; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16
 ; CHECK-NEXT:    ret
   %out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
   ret <16 x i32> %out
@@ -1098,12 +1095,12 @@ define <16 x i32> @shuffle_disjoint_lanes_one_broadcast(<16 x i32> %v, <16 x i32
 ; CHECK-NEXT:    lui a0, %hi(.LCPI76_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI76_0)
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; CHECK-NEXT:    vle16.v v20, (a0)
+; CHECK-NEXT:    vrgather.vi v16, v8, 7
+; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    lui a0, 15
 ; CHECK-NEXT:    addi a0, a0, 240
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vrgather.vi v16, v8, 7
-; CHECK-NEXT:    vrgatherei16.vv v16, v12, v20, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v16, v12, v8, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
   %out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 26, i32 30, i32 22, i32 18, i32 7, i32 7, i32 7, i32 7, i32 24, i32 28, i32 20, i32 16>
@@ -1113,14 +1110,14 @@ define <16 x i32> @shuffle_disjoint_lanes_one_broadcast(<16 x i32> %v, <16 x i32
 define <16 x i32> @shuffle_disjoint_lanes_one_splat(i32 %v, <16 x i32> %w) {
 ; CHECK-LABEL: shuffle_disjoint_lanes_one_splat:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI77_0)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI77_0)
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; CHECK-NEXT:    vle16.v v16, (a1)
-; CHECK-NEXT:    lui a1, 15
-; CHECK-NEXT:    addi a1, a1, 240
-; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    lui a0, %hi(.LCPI77_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI77_0)
+; CHECK-NEXT:    vle16.v v16, (a0)
+; CHECK-NEXT:    lui a0, 15
+; CHECK-NEXT:    addi a0, a0, 240
+; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -1249,14 +1246,14 @@ define void @shuffle_i128_ldst(ptr %p) {
 define void @shuffle_i256_ldst(ptr %p) {
 ; CHECK-LABEL: shuffle_i256_ldst:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI80_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI80_0)
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a1)
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vsext.vf2 v10, v8
+; CHECK-NEXT:    vle8.v v16, (a1)
+; CHECK-NEXT:    vsext.vf2 v18, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v24, v16, v10
+; CHECK-NEXT:    vrgatherei16.vv v24, v8, v18
 ; CHECK-NEXT:    vse64.v v24, (a0)
 ; CHECK-NEXT:    ret
   %a = load <4 x i256>, ptr %p
@@ -1361,8 +1358,8 @@ define <16 x i32> @shuffle_m2_prefix(<16 x i32> %a) {
 ; CHECK-NEXT:    lui a0, %hi(.LCPI85_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI85_0)
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle16.v v14, (a0)
-; CHECK-NEXT:    vrgatherei16.vv v12, v8, v14
+; CHECK-NEXT:    vle16.v v10, (a0)
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v10
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
   %out = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 5, i32 2, i32 3, i32 5, i32 7, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll
index 32c1f2ca32fab..a5e730d47395d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll
@@ -77,10 +77,10 @@ define void @gather_const_v2i64(ptr %x) {
 define void @gather_const_v64i8(ptr %x) {
 ; CHECK-LABEL: gather_const_v64i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lbu a1, 32(a0)
-; CHECK-NEXT:    li a2, 64
-; CHECK-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    lbu a2, 32(a0)
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vmv.v.x v8, a2
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <64 x i8>, ptr %x
@@ -94,10 +94,10 @@ define void @gather_const_v64i8(ptr %x) {
 define void @gather_const_v16i16(ptr %x) {
 ; CHECK-LABEL: gather_const_v16i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lh a1, 50(a0)
-; CHECK-NEXT:    li a2, 32
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    lh a2, 50(a0)
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vmv.v.x v8, a2
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <32 x i16>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index 392709fdb4cf7..e6514cfe7d473 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -1046,46 +1046,45 @@ define void @mulhu_v16i8(ptr %x) {
 ; CHECK-LABEL: mulhu_v16i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vle8.v v10, (a0)
 ; CHECK-NEXT:    lui a1, 3
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    lui a2, %hi(.LCPI65_0)
-; CHECK-NEXT:    addi a2, a2, %lo(.LCPI65_0)
-; CHECK-NEXT:    vle8.v v11, (a2)
+; CHECK-NEXT:    vmv.v.i v11, 0
+; CHECK-NEXT:    li a2, 513
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v8, a2
 ; CHECK-NEXT:    lui a2, 1
 ; CHECK-NEXT:    addi a1, a1, -2044
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    addi a1, a2, 32
-; CHECK-NEXT:    vmv.s.x v8, a1
+; CHECK-NEXT:    vmv.s.x v9, a1
 ; CHECK-NEXT:    li a1, -128
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmerge.vxm v12, v10, a1, v0
-; CHECK-NEXT:    li a1, 513
-; CHECK-NEXT:    vmv.v.i v13, 4
+; CHECK-NEXT:    vmerge.vxm v12, v11, a1, v0
+; CHECK-NEXT:    lui a1, %hi(.LCPI65_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI65_0)
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v9, v11, 1, v0
+; CHECK-NEXT:    vmv.v.i v11, 4
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vle8.v v11, (a1)
 ; CHECK-NEXT:    addi a1, a2, 78
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmerge.vim v10, v13, 1, v0
+; CHECK-NEXT:    vsrl.vv v9, v10, v9
+; CHECK-NEXT:    vmulhu.vv v9, v9, v11
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vsrl.vv v8, v9, v8
-; CHECK-NEXT:    vmulhu.vv v8, v8, v11
-; CHECK-NEXT:    vmerge.vim v10, v10, 3, v0
+; CHECK-NEXT:    vmerge.vim v8, v8, 3, v0
 ; CHECK-NEXT:    lui a1, 8
 ; CHECK-NEXT:    addi a1, a1, 304
-; CHECK-NEXT:    vsub.vv v9, v9, v8
-; CHECK-NEXT:    vmulhu.vv v9, v9, v12
+; CHECK-NEXT:    vsub.vv v10, v10, v9
+; CHECK-NEXT:    vmulhu.vv v10, v10, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vmerge.vim v9, v10, 2, v0
-; CHECK-NEXT:    vsrl.vv v8, v8, v9
+; CHECK-NEXT:    vadd.vv v9, v10, v9
+; CHECK-NEXT:    vmerge.vim v8, v8, 2, v0
+; CHECK-NEXT:    vsrl.vv v8, v9, v8
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <16 x i8>, ptr %x
@@ -1108,20 +1107,20 @@ define void @mulhu_v8i16(ptr %x) {
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI66_0)
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v11, 3
-; CHECK-NEXT:    vle16.v v12, (a1)
-; CHECK-NEXT:    vmerge.vim v11, v11, 2, v0
-; CHECK-NEXT:    vmv1r.v v13, v9
+; CHECK-NEXT:    vmv1r.v v12, v9
 ; CHECK-NEXT:    vsetivli zero, 7, e16, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v9, v10, 6
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vmerge.vim v11, v11, 2, v0
+; CHECK-NEXT:    vle16.v v13, (a1)
 ; CHECK-NEXT:    vsrl.vv v9, v8, v9
-; CHECK-NEXT:    vmulhu.vv v9, v9, v12
+; CHECK-NEXT:    vmulhu.vv v9, v9, v13
 ; CHECK-NEXT:    lui a1, 1048568
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v13, a1
+; CHECK-NEXT:    vmv.s.x v12, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vmulhu.vv v8, v8, v13
+; CHECK-NEXT:    vmulhu.vv v8, v8, v12
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 7, e16, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v11, v10, 6
@@ -1162,13 +1161,13 @@ define void @mulhu_v4i32(ptr %x) {
 ; CHECK-NEXT:    vmv.s.x v10, a1
 ; CHECK-NEXT:    lui a1, %hi(.LCPI68_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI68_0)
-; CHECK-NEXT:    vle32.v v11, (a1)
 ; CHECK-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v9, v10, 2
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v10, (a1)
 ; CHECK-NEXT:    lui a1, 4128
 ; CHECK-NEXT:    addi a1, a1, 514
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v11
+; CHECK-NEXT:    vmulhu.vv v10, v8, v10
 ; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vmulhu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv.s.x v9, a1
@@ -1206,8 +1205,6 @@ define void @mulhu_v2i64(ptr %x) {
 ;
 ; RV64-LABEL: mulhu_v2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    lui a1, 838861
 ; RV64-NEXT:    lui a2, 699051
 ; RV64-NEXT:    addiw a1, a1, -819
@@ -1216,6 +1213,8 @@ define void @mulhu_v2i64(ptr %x) {
 ; RV64-NEXT:    add a1, a1, a3
 ; RV64-NEXT:    slli a3, a2, 32
 ; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    vmv.v.x v9, a1
 ; RV64-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
 ; RV64-NEXT:    vmv.s.x v9, a2
@@ -1322,10 +1321,10 @@ define void @mulhs_v4i32(ptr %x) {
 ;
 ; RV64-LABEL: mulhs_v4i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, %hi(.LCPI73_0)
-; RV64-NEXT:    ld a1, %lo(.LCPI73_0)(a1)
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    lui a1, %hi(.LCPI73_0)
+; RV64-NEXT:    ld a1, %lo(.LCPI73_0)(a1)
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v9, a1
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
@@ -3153,27 +3152,27 @@ define void @mulhu_v32i8(ptr %x) {
 ; CHECK-LABEL: mulhu_v32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    lui a2, 163907
-; CHECK-NEXT:    addi a2, a2, -2044
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a2
-; CHECK-NEXT:    lui a2, 66049
-; CHECK-NEXT:    addi a2, a2, 32
-; CHECK-NEXT:    vmv.s.x v8, a2
-; CHECK-NEXT:    li a2, -128
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    lui a2, %hi(.LCPI181_0)
+; CHECK-NEXT:    addi a2, a2, %lo(.LCPI181_0)
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vle8.v v10, (a2)
+; CHECK-NEXT:    lui a1, 163907
+; CHECK-NEXT:    addi a1, a1, -2044
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    lui a1, 66049
+; CHECK-NEXT:    addi a1, a1, 32
+; CHECK-NEXT:    vmv.s.x v8, a1
+; CHECK-NEXT:    li a1, -128
+; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmerge.vxm v10, v12, a2, v0
-; CHECK-NEXT:    lui a1, %hi(.LCPI181_0)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI181_0)
-; CHECK-NEXT:    vle8.v v14, (a0)
+; CHECK-NEXT:    vmerge.vxm v14, v12, a1, v0
+; CHECK-NEXT:    lui a1, 8208
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
-; CHECK-NEXT:    vle8.v v12, (a1)
-; CHECK-NEXT:    lui a1, 8208
+; CHECK-NEXT:    vle8.v v12, (a0)
 ; CHECK-NEXT:    addi a1, a1, 513
-; CHECK-NEXT:    vsrl.vv v8, v14, v8
-; CHECK-NEXT:    vmulhu.vv v12, v8, v12
+; CHECK-NEXT:    vsrl.vv v8, v12, v8
+; CHECK-NEXT:    vmulhu.vv v10, v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    lui a1, 66785
@@ -3181,8 +3180,8 @@ define void @mulhu_v32i8(ptr %x) {
 ; CHECK-NEXT:    vmv.s.x v8, a1
 ; CHECK-NEXT:    lui a1, 529160
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vsub.vv v14, v14, v12
-; CHECK-NEXT:    vmulhu.vv v10, v14, v10
+; CHECK-NEXT:    vsub.vv v12, v12, v10
+; CHECK-NEXT:    vmulhu.vv v12, v12, v14
 ; CHECK-NEXT:    vmv.v.i v14, 4
 ; CHECK-NEXT:    addi a1, a1, 304
 ; CHECK-NEXT:    vmerge.vim v14, v14, 1, v0
@@ -3191,7 +3190,7 @@ define void @mulhu_v32i8(ptr %x) {
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v14, v14, 3, v0
-; CHECK-NEXT:    vadd.vv v10, v10, v12
+; CHECK-NEXT:    vadd.vv v10, v12, v10
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v8, v14, 2, v0
 ; CHECK-NEXT:    vsrl.vv v8, v10, v8
@@ -3291,11 +3290,11 @@ define void @mulhu_v8i32(ptr %x) {
 ; CHECK-NEXT:    li a1, 68
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    lui a1, 524288
+; CHECK-NEXT:    vmerge.vxm v10, v10, a1, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI183_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI183_0)
 ; CHECK-NEXT:    vle32.v v12, (a1)
-; CHECK-NEXT:    lui a1, 524288
-; CHECK-NEXT:    vmerge.vxm v10, v10, a1, v0
 ; CHECK-NEXT:    lui a1, 4128
 ; CHECK-NEXT:    addi a1, a1, 514
 ; CHECK-NEXT:    vmulhu.vv v12, v8, v12
@@ -3450,10 +3449,10 @@ define void @mulhs_v8i32(ptr %x) {
 ;
 ; RV64-LABEL: mulhs_v8i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, %hi(.LCPI187_0)
-; RV64-NEXT:    ld a1, %lo(.LCPI187_0)(a1)
 ; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    lui a1, %hi(.LCPI187_0)
+; RV64-NEXT:    ld a1, %lo(.LCPI187_0)(a1)
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a1
 ; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
@@ -3507,6 +3506,8 @@ define void @mulhs_v4i64(ptr %x) {
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT:    vmv.v.i v0, 5
 ; RV64-NEXT:    lui a2, 1044496
 ; RV64-NEXT:    addiw a1, a1, 1365
 ; RV64-NEXT:    addi a2, a2, -256
@@ -3514,12 +3515,10 @@ define void @mulhs_v4i64(ptr %x) {
 ; RV64-NEXT:    slli a2, a1, 32
 ; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    lui a2, %hi(.LCPI188_0)
-; RV64-NEXT:    ld a2, %lo(.LCPI188_0)(a2)
-; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; RV64-NEXT:    vmv.v.i v0, 5
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-NEXT:    vmv.v.x v12, a1
 ; RV64-NEXT:    li a1, 63
+; RV64-NEXT:    ld a2, %lo(.LCPI188_0)(a2)
 ; RV64-NEXT:    vmerge.vxm v12, v12, a2, v0
 ; RV64-NEXT:    lui a2, 4096
 ; RV64-NEXT:    addi a2, a2, 256
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
index b65352aed2d52..211c434c65743 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
@@ -7,55 +7,53 @@
 define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-LABEL: load_large_vector:
 ; ZVE32X:       # %bb.0:
-; ZVE32X-NEXT:    ld a1, 0(a0)
-; ZVE32X-NEXT:    ld a2, 8(a0)
-; ZVE32X-NEXT:    ld a3, 24(a0)
-; ZVE32X-NEXT:    ld a4, 32(a0)
-; ZVE32X-NEXT:    ld a5, 48(a0)
-; ZVE32X-NEXT:    ld a6, 56(a0)
-; ZVE32X-NEXT:    ld a7, 72(a0)
-; ZVE32X-NEXT:    ld a0, 80(a0)
+; ZVE32X-NEXT:    ld a1, 48(a0)
+; ZVE32X-NEXT:    ld a2, 56(a0)
+; ZVE32X-NEXT:    ld a3, 72(a0)
+; ZVE32X-NEXT:    ld a4, 80(a0)
+; ZVE32X-NEXT:    ld a5, 0(a0)
+; ZVE32X-NEXT:    ld a6, 8(a0)
+; ZVE32X-NEXT:    ld a7, 24(a0)
+; ZVE32X-NEXT:    ld a0, 32(a0)
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmv.s.x v8, zero
 ; ZVE32X-NEXT:    vmv.v.i v9, 0
-; ZVE32X-NEXT:    xor a3, a3, a4
-; ZVE32X-NEXT:    xor a1, a1, a2
-; ZVE32X-NEXT:    xor a2, a5, a6
 ; ZVE32X-NEXT:    xor a0, a7, a0
-; ZVE32X-NEXT:    snez a3, a3
+; ZVE32X-NEXT:    xor a5, a5, a6
+; ZVE32X-NEXT:    xor a1, a1, a2
+; ZVE32X-NEXT:    xor a3, a3, a4
+; ZVE32X-NEXT:    snez a0, a0
+; ZVE32X-NEXT:    snez a2, a5
 ; ZVE32X-NEXT:    snez a1, a1
-; ZVE32X-NEXT:    vmv.s.x v10, a3
-; ZVE32X-NEXT:    vmv.s.x v11, a1
+; ZVE32X-NEXT:    snez a3, a3
+; ZVE32X-NEXT:    vmv.s.x v10, a0
+; ZVE32X-NEXT:    vmv.s.x v11, a2
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vand.vi v10, v10, 1
+; ZVE32X-NEXT:    vand.vi v11, v11, 1
 ; ZVE32X-NEXT:    vmsne.vi v0, v10, 0
-; ZVE32X-NEXT:    vand.vi v10, v11, 1
-; ZVE32X-NEXT:    vmerge.vim v11, v8, 1, v0
-; ZVE32X-NEXT:    vmsne.vi v0, v10, 0
-; ZVE32X-NEXT:    snez a1, a2
+; ZVE32X-NEXT:    vmerge.vim v10, v8, 1, v0
+; ZVE32X-NEXT:    vmsne.vi v0, v11, 0
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT:    vmerge.vim v10, v9, 1, v0
+; ZVE32X-NEXT:    vmerge.vim v11, v9, 1, v0
 ; ZVE32X-NEXT:    vsetivli zero, 2, e8, mf4, tu, ma
-; ZVE32X-NEXT:    vslideup.vi v10, v11, 1
-; ZVE32X-NEXT:    vmv.s.x v11, a1
-; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT:    vmsne.vi v0, v10, 0
+; ZVE32X-NEXT:    vslideup.vi v11, v10, 1
+; ZVE32X-NEXT:    vmv.s.x v10, a1
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32X-NEXT:    vand.vi v10, v11, 1
+; ZVE32X-NEXT:    vand.vi v10, v10, 1
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; ZVE32X-NEXT:    vmsne.vi v0, v11, 0
 ; ZVE32X-NEXT:    vmerge.vim v11, v9, 1, v0
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmsne.vi v0, v10, 0
-; ZVE32X-NEXT:    snez a0, a0
 ; ZVE32X-NEXT:    vmerge.vim v10, v8, 1, v0
 ; ZVE32X-NEXT:    vsetivli zero, 3, e8, mf4, tu, ma
 ; ZVE32X-NEXT:    vslideup.vi v11, v10, 2
-; ZVE32X-NEXT:    vmv.s.x v10, a0
-; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT:    vmsne.vi v0, v11, 0
+; ZVE32X-NEXT:    vmv.s.x v10, a3
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vand.vi v10, v10, 1
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; ZVE32X-NEXT:    vmsne.vi v0, v11, 0
 ; ZVE32X-NEXT:    vmerge.vim v9, v9, 1, v0
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmsne.vi v0, v10, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 1516c67bf7ecc..e1f834b263782 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -183,10 +183,10 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 88
+; RV32-NEXT:    li a3, 84
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 88 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd4, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 84 * vlenb
 ; RV32-NEXT:    addi a3, a1, 256
 ; RV32-NEXT:    addi a4, a1, 128
 ; RV32-NEXT:    li a2, 32
@@ -194,79 +194,127 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    lui a6, %hi(.LCPI8_0)
 ; RV32-NEXT:    addi a6, a6, %lo(.LCPI8_0)
 ; RV32-NEXT:    li a7, 768
-; RV32-NEXT:    lui t0, 49164
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    lui a1, 49164
+; RV32-NEXT:    vle32.v v24, (a4)
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    li t0, 60
+; RV32-NEXT:    mul a4, a4, t0
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a4, %hi(.LCPI8_1)
+; RV32-NEXT:    addi a4, a4, %lo(.LCPI8_1)
+; RV32-NEXT:    addi a5, a5, 3
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vle16.v v8, (a6)
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    li t0, 76
+; RV32-NEXT:    mul a6, a6, t0
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs2r.v v8, (a6) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv.s.x v8, a7
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    li a7, 36
+; RV32-NEXT:    mul a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs1r.v v8, (a6) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a1, a1, 12
+; RV32-NEXT:    vle16.v v8, (a4)
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    li a6, 28
+; RV32-NEXT:    mul a4, a4, a6
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs2r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv.s.x v0, a5
+; RV32-NEXT:    vmv.s.x v8, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li t1, 72
-; RV32-NEXT:    mul a1, a1, t1
+; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    mul a1, a1, a4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vle32.v v8, (a4)
+; RV32-NEXT:    vs1r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv8r.v v8, v16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    li a4, 68
+; RV32-NEXT:    mul a1, a1, a4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a5, a5, 3
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle16.v v6, (a6)
-; RV32-NEXT:    vmv.s.x v0, a5
-; RV32-NEXT:    lui a1, %hi(.LCPI8_1)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_1)
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmerge.vvm v16, v8, v16, v0
+; RV32-NEXT:    vmerge.vvm v16, v24, v16, v0
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a4, 76
+; RV32-NEXT:    mul a1, a1, a4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl2r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vrgatherei16.vv v24, v16, v6
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vrgatherei16.vv v0, v16, v24
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a4, 44
+; RV32-NEXT:    mul a1, a1, a4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v0, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v16, (a3)
-; RV32-NEXT:    addi t0, t0, 12
-; RV32-NEXT:    vmv.s.x v0, a7
-; RV32-NEXT:    vmv.s.x v7, t0
-; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vle16.v v4, (a1)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a3, 76
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a3, 52
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 80
+; RV32-NEXT:    li a3, 36
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmerge.vvm v20, v24, v16, v0
+; RV32-NEXT:    vmerge.vvm v16, v24, v16, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 36
+; RV32-NEXT:    li a3, 20
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv1r.v v0, v7
+; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 72
+; RV32-NEXT:    li a3, 60
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmerge.vvm v24, v8, v16, v0
+; RV32-NEXT:    vmerge.vvm v24, v16, v8, v0
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a3, 28
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl2r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vrgatherei16.vv v8, v24, v4
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    li a3, 36
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -278,23 +326,22 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    slli a1, a1, 10
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vmv.s.x v0, a1
-; RV32-NEXT:    vle16.v v14, (a4)
 ; RV32-NEXT:    vmv.s.x v12, a3
+; RV32-NEXT:    vle16.v v14, (a4)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 80
+; RV32-NEXT:    li a3, 76
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vmv4r.v v8, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a3, 52
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmerge.vvm v8, v24, v8, v0
+; RV32-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a1, a1, a3
@@ -303,7 +350,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vmv1r.v v0, v12
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    li a3, 68
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a3, 60
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
@@ -323,326 +377,312 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    lui a5, 768
 ; RV32-NEXT:    li a6, 48
 ; RV32-NEXT:    lui a7, 3073
-; RV32-NEXT:    li t0, 192
 ; RV32-NEXT:    addi a1, a1, 3
 ; RV32-NEXT:    addi a3, a3, 192
 ; RV32-NEXT:    addi a4, a4, 12
 ; RV32-NEXT:    addi a5, a5, 768
 ; RV32-NEXT:    addi a7, a7, -1024
-; RV32-NEXT:    vmv.s.x v13, a6
-; RV32-NEXT:    vmv.s.x v2, t0
+; RV32-NEXT:    vmv.s.x v2, a6
 ; RV32-NEXT:    vmv.s.x v0, a1
-; RV32-NEXT:    vmv.s.x v12, a3
-; RV32-NEXT:    vmv.s.x v3, a4
-; RV32-NEXT:    vmv.s.x v14, a5
-; RV32-NEXT:    vmv.s.x v1, a7
+; RV32-NEXT:    vmv.s.x v8, a3
+; RV32-NEXT:    vmv.s.x v20, a4
+; RV32-NEXT:    vmv.s.x v1, a5
+; RV32-NEXT:    vmv.s.x v3, a7
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a3, 52
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vmv4r.v v8, v16
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vmv4r.v v16, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 80
+; RV32-NEXT:    li a3, 76
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmerge.vvm v20, v8, v16, v0
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv1r.v v0, v12
+; RV32-NEXT:    vmerge.vvm v4, v16, v24, v0
+; RV32-NEXT:    vmv1r.v v0, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    li a3, 60
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 72
+; RV32-NEXT:    li a3, 68
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmerge.vvm v24, v16, v24, v0
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 12
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vmerge.vvm v24, v8, v24, v0
+; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv1r.v v0, v3
+; RV32-NEXT:    vmv1r.v v0, v20
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 80
+; RV32-NEXT:    li a3, 76
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmerge.vvm v4, v8, v24, v0
-; RV32-NEXT:    vmv1r.v v0, v14
+; RV32-NEXT:    vmerge.vvm v20, v16, v24, v0
+; RV32-NEXT:    vmv1r.v v0, v1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 72
+; RV32-NEXT:    li a3, 68
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmerge.vvm v24, v16, v24, v0
+; RV32-NEXT:    vmerge.vvm v24, v8, v24, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    li a3, 12
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv1r.v v0, v13
+; RV32-NEXT:    vmv1r.v v0, v2
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 80
+; RV32-NEXT:    li a3, 76
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmerge.vvm v12, v8, v24, v0
+; RV32-NEXT:    vmerge.vvm v12, v16, v8, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 20
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vmv1r.v v0, v3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 72
+; RV32-NEXT:    li a3, 68
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    li a3, 60
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmerge.vvm v16, v24, v16, v0
+; RV32-NEXT:    vmerge.vvm v24, v8, v24, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 72
+; RV32-NEXT:    li a2, 68
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv1r.v v0, v2
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    li a1, 192
+; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 80
+; RV32-NEXT:    li a2, 76
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmerge.vvm v8, v8, v16, v0
+; RV32-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    li a2, 60
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 32
 ; RV32-NEXT:    addi a1, a1, 4
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 36
+; RV32-NEXT:    li a2, 20
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vrgatherei16.vv v16, v8, v12
+; RV32-NEXT:    vrgatherei16.vv v8, v12, v16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    li a2, 44
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v16, v8
+; RV32-NEXT:    vmv.v.v v8, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 80
+; RV32-NEXT:    li a2, 76
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 48
-; RV32-NEXT:    lui a2, %hi(.LCPI8_3)
-; RV32-NEXT:    addi a2, a2, %lo(.LCPI8_3)
 ; RV32-NEXT:    addi a1, a1, 5
-; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT:    vle16.v v24, (a2)
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v25, a1
+; RV32-NEXT:    vmv.v.x v3, a1
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vrgatherei16.vv v16, v8, v25
+; RV32-NEXT:    vrgatherei16.vv v8, v12, v3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    li a2, 36
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v16, v8
+; RV32-NEXT:    vmv.v.v v8, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 36
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, %hi(.LCPI8_3)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_3)
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vrgatherei16.vv v16, v8, v24
+; RV32-NEXT:    vle16.v v24, (a1)
+; RV32-NEXT:    vrgatherei16.vv v12, v4, v24
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a2, 28
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v16, v8
+; RV32-NEXT:    vmv.v.v v12, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    li a2, 52
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI8_4)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_4)
-; RV32-NEXT:    lui a2, %hi(.LCPI8_5)
-; RV32-NEXT:    addi a2, a2, %lo(.LCPI8_5)
-; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vle16.v v26, (a1)
-; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT:    vle16.v v24, (a2)
-; RV32-NEXT:    lui a1, %hi(.LCPI8_6)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_6)
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vle16.v v2, (a1)
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 12
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v26
+; RV32-NEXT:    vle16.v v12, (a1)
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v24, v0, v12
+; RV32-NEXT:    lui a1, %hi(.LCPI8_5)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_5)
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vrgatherei16.vv v20, v4, v24
+; RV32-NEXT:    vle16.v v28, (a1)
+; RV32-NEXT:    vrgatherei16.vv v8, v20, v28
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v20, v8
+; RV32-NEXT:    vmv.v.v v8, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    li a2, 28
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, %hi(.LCPI8_6)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_6)
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vrgatherei16.vv v24, v8, v2
+; RV32-NEXT:    vle16.v v24, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    li a2, 12
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v0, v16, v24
 ; RV32-NEXT:    lui a1, %hi(.LCPI8_7)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_7)
-; RV32-NEXT:    lui a2, %hi(.LCPI8_8)
-; RV32-NEXT:    addi a2, a2, %lo(.LCPI8_8)
-; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT:    vle16.v v12, (a1)
-; RV32-NEXT:    lui a1, %hi(.LCPI8_9)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_9)
-; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vle16.v v16, (a2)
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vle16.v v18, (a1)
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 20
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v24, v0, v12
+; RV32-NEXT:    vle16.v v20, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v24, v16, v20
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v24, v0
+; RV32-NEXT:    lui a1, %hi(.LCPI8_8)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_8)
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vle16.v v16, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 72
+; RV32-NEXT:    li a2, 68
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vrgatherei16.vv v8, v0, v16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    li a2, 44
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, %hi(.LCPI8_9)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_9)
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vle16.v v16, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    li a2, 60
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vrgatherei16.vv v8, v4, v18
+; RV32-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v20, v28, v16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    li a2, 44
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v8, v0
+; RV32-NEXT:    vmv.v.v v20, v8
 ; RV32-NEXT:    addi a1, a0, 320
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vse32.v v8, (a1)
+; RV32-NEXT:    vse32.v v20, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
 ; RV32-NEXT:    vse32.v v24, (a1)
 ; RV32-NEXT:    addi a1, a0, 192
-; RV32-NEXT:    vse32.v v20, (a1)
-; RV32-NEXT:    addi a1, a0, 128
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    li a3, 28
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a1)
+; RV32-NEXT:    addi a1, a0, 128
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    li a3, 52
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl4r.v v12, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vse32.v v12, (a1)
 ; RV32-NEXT:    addi a1, a0, 64
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a3, 36
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 80
+; RV32-NEXT:    li a2, 76
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a0)
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 88
+; RV32-NEXT:    li a1, 84
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
@@ -659,463 +699,419 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    sub sp, sp, a2
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 88 * vlenb
-; RV64-NEXT:    addi a3, a1, 128
-; RV64-NEXT:    addi a6, a1, 256
-; RV64-NEXT:    li a4, 128
-; RV64-NEXT:    lui a2, 1
-; RV64-NEXT:    lui a5, %hi(.LCPI8_0)
-; RV64-NEXT:    addi a5, a5, %lo(.LCPI8_0)
+; RV64-NEXT:    addi a3, a1, 256
+; RV64-NEXT:    li a2, 128
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vmv.v.i v16, 6
+; RV64-NEXT:    lui a4, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vle64.v v8, (a6)
-; RV64-NEXT:    lui a6, 16
-; RV64-NEXT:    addi a6, a6, 7
+; RV64-NEXT:    vle64.v v8, (a3)
+; RV64-NEXT:    addi a4, a4, 7
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v17, a6
-; RV64-NEXT:    addi a6, a2, 65
+; RV64-NEXT:    vmv.v.x v17, a4
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vrgather.vi v4, v8, 4
+; RV64-NEXT:    vrgather.vi v24, v8, 4
 ; RV64-NEXT:    vrgather.vi v20, v8, 5
-; RV64-NEXT:    csrr a7, vlenb
-; RV64-NEXT:    li t0, 84
-; RV64-NEXT:    mul a7, a7, t0
-; RV64-NEXT:    add a7, sp, a7
-; RV64-NEXT:    addi a7, a7, 16
-; RV64-NEXT:    vs4r.v v20, (a7) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 76
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs4r.v v20, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    vrgatherei16.vv v20, v8, v16
-; RV64-NEXT:    csrr a7, vlenb
-; RV64-NEXT:    slli a7, a7, 6
-; RV64-NEXT:    add a7, sp, a7
-; RV64-NEXT:    addi a7, a7, 16
-; RV64-NEXT:    vs4r.v v20, (a7) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 84
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs4r.v v20, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    vrgatherei16.vv v20, v8, v17
-; RV64-NEXT:    csrr a7, vlenb
-; RV64-NEXT:    li t0, 56
-; RV64-NEXT:    mul a7, a7, t0
-; RV64-NEXT:    add a7, sp, a7
-; RV64-NEXT:    addi a7, a7, 16
-; RV64-NEXT:    vs4r.v v20, (a7) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 80
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs4r.v v20, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    vrgather.vi v16, v8, 2
-; RV64-NEXT:    csrr a7, vlenb
-; RV64-NEXT:    li t0, 72
-; RV64-NEXT:    mul a7, a7, t0
-; RV64-NEXT:    add a7, sp, a7
-; RV64-NEXT:    addi a7, a7, 16
-; RV64-NEXT:    vs4r.v v16, (a7) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 72
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs4r.v v16, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    vrgather.vi v16, v8, 3
-; RV64-NEXT:    csrr a7, vlenb
-; RV64-NEXT:    li t0, 68
-; RV64-NEXT:    mul a7, a7, t0
-; RV64-NEXT:    add a7, sp, a7
-; RV64-NEXT:    addi a7, a7, 16
-; RV64-NEXT:    vs4r.v v16, (a7) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 6
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs4r.v v16, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 8
-; RV64-NEXT:    csrr a7, vlenb
-; RV64-NEXT:    li t0, 40
-; RV64-NEXT:    mul a7, a7, t0
-; RV64-NEXT:    add a7, sp, a7
-; RV64-NEXT:    addi a7, a7, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv.s.x v0, a4
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 16
-; RV64-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 48
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv.s.x v7, a2
+; RV64-NEXT:    vmv1r.v v0, v7
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vs1r.v v7, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
+; RV64-NEXT:    vrgather.vi v24, v8, 2, v0.t
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    li a3, 68
+; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs4r.v v24, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a3, a1, 128
+; RV64-NEXT:    lui a2, 1
+; RV64-NEXT:    lui a4, %hi(.LCPI8_0)
+; RV64-NEXT:    addi a4, a4, %lo(.LCPI8_0)
+; RV64-NEXT:    addi a5, a2, 65
+; RV64-NEXT:    vmv.s.x v0, a5
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vle64.v v24, (a1)
+; RV64-NEXT:    vle16.v v2, (a4)
+; RV64-NEXT:    vle64.v v8, (a1)
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a4, 56
+; RV64-NEXT:    mul a1, a1, a4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vle64.v v16, (a3)
+; RV64-NEXT:    vmerge.vvm v24, v16, v8, v0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 76
+; RV64-NEXT:    li a3, 40
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vle16.v v12, (a5)
+; RV64-NEXT:    vrgatherei16.vv v8, v24, v2
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    slli a1, a1, 5
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs2r.v v12, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv.s.x v2, a6
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgather.vi v4, v8, 2, v0.t
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 60
+; RV64-NEXT:    li a3, 76
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v4, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv1r.v v0, v2
-; RV64-NEXT:    vmv8r.v v8, v24
+; RV64-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a3, 48
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vmerge.vvm v24, v16, v24, v0
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl2r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v0, v24, v16
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
+; RV64-NEXT:    vrgather.vi v24, v8, 3, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    li a3, 76
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v0, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v24, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    lui a1, 2
 ; RV64-NEXT:    lui a3, %hi(.LCPI8_1)
 ; RV64-NEXT:    addi a3, a3, %lo(.LCPI8_1)
 ; RV64-NEXT:    addi a1, a1, 130
-; RV64-NEXT:    vle16.v v16, (a3)
-; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    slli a3, a3, 3
-; RV64-NEXT:    add a3, sp, a3
-; RV64-NEXT:    addi a3, a3, 16
-; RV64-NEXT:    vs2r.v v16, (a3) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv.s.x v2, a1
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 5
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 84
-; RV64-NEXT:    mul a1, a1, a3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 40
-; RV64-NEXT:    mul a1, a1, a3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgather.vi v24, v16, 3, v0.t
+; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vle16.v v8, (a3)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 84
-; RV64-NEXT:    mul a1, a1, a3
+; RV64-NEXT:    slli a1, a1, 4
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v24, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv1r.v v0, v2
+; RV64-NEXT:    vs2r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 76
+; RV64-NEXT:    li a3, 56
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vmerge.vvm v24, v16, v8, v0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    slli a1, a1, 4
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl2r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v0, v24, v8
+; RV64-NEXT:    vl2r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v0, v24, v16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v0, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    lui a1, 4
-; RV64-NEXT:    lui a3, 8
 ; RV64-NEXT:    addi a1, a1, 260
-; RV64-NEXT:    addi a3, a3, 520
 ; RV64-NEXT:    vmv.s.x v0, a1
-; RV64-NEXT:    vmv.s.x v2, a3
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 48
+; RV64-NEXT:    li a3, 40
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmerge.vvm v8, v16, v24, v0
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vmerge.vvm v24, v16, v8, v0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vl1r.v v7, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 6
+; RV64-NEXT:    li a3, 84
+; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 40
+; RV64-NEXT:    li a3, 48
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv4r.v v8, v16
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgather.vi v12, v16, 4, v0.t
+; RV64-NEXT:    vrgather.vi v24, v16, 4, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 6
+; RV64-NEXT:    li a3, 84
+; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv1r.v v0, v2
+; RV64-NEXT:    vs4r.v v24, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, 8
+; RV64-NEXT:    addi a1, a1, 520
+; RV64-NEXT:    vmv.s.x v0, a1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 76
+; RV64-NEXT:    li a3, 40
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vmerge.vvm v16, v16, v24, v0
+; RV64-NEXT:    vmerge.vvm v24, v16, v8, v0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    slli a1, a1, 4
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    li a3, 80
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a3, 48
+; RV64-NEXT:    mul a1, a1, a3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vrgather.vi v24, v8, 5, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    li a3, 80
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v24, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    lui a1, 96
 ; RV64-NEXT:    li a3, 192
-; RV64-NEXT:    vmv.s.x v3, a3
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v24, a1
-; RV64-NEXT:    vmv1r.v v0, v3
+; RV64-NEXT:    vmv.v.x v2, a1
+; RV64-NEXT:    vmv.s.x v3, a3
+; RV64-NEXT:    vmv.v.v v0, v3
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a3, 72
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v28, v8, v24, v0.t
-; RV64-NEXT:    vmv4r.v v16, v8
+; RV64-NEXT:    vrgatherei16.vv v24, v8, v2, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a3, 72
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v28, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    lui a1, %hi(.LCPI8_2)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI8_2)
-; RV64-NEXT:    li a3, 1040
-; RV64-NEXT:    lui a4, 112
-; RV64-NEXT:    addi a4, a4, 1
-; RV64-NEXT:    vmv.s.x v0, a3
+; RV64-NEXT:    vs4r.v v24, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    li a1, 1040
+; RV64-NEXT:    lui a3, 112
+; RV64-NEXT:    addi a3, a3, 1
+; RV64-NEXT:    vmv.s.x v0, a1
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v5, a4
-; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vle16.v v6, (a1)
+; RV64-NEXT:    vmv.v.x v12, a3
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 76
-; RV64-NEXT:    mul a1, a1, a3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 48
+; RV64-NEXT:    li a3, 56
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmerge.vvm v24, v8, v24, v0
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 5
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vmerge.vvm v24, v16, v24, v0
+; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv1r.v v0, v3
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 68
-; RV64-NEXT:    mul a1, a1, a3
+; RV64-NEXT:    slli a1, a1, 6
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v28, v16, v5, v0.t
+; RV64-NEXT:    vrgatherei16.vv v24, v8, v12, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 68
-; RV64-NEXT:    mul a1, a1, a3
+; RV64-NEXT:    slli a1, a1, 6
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v28, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a1, a2, -2016
-; RV64-NEXT:    vmv.s.x v0, a1
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vs4r.v v24, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, %hi(.LCPI8_2)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI8_2)
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v16, v24, v6
+; RV64-NEXT:    vle16.v v24, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 40
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v0, v8, v24
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 48
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    li a3, 48
+; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vs8r.v v0, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a1, a2, -2016
+; RV64-NEXT:    vmv.s.x v0, a1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 76
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 76
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    lui a1, %hi(.LCPI8_3)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI8_3)
-; RV64-NEXT:    vle16.v v8, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 60
+; RV64-NEXT:    li a2, 68
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 24
-; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    slli a1, a1, 5
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v12, v0
+; RV64-NEXT:    vmv.v.v v16, v8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 60
+; RV64-NEXT:    li a2, 68
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 84
+; RV64-NEXT:    li a2, 76
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    li a2, 24
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv.v.v v12, v24
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv.v.v v20, v8
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 84
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 6
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 40
+; RV64-NEXT:    li a2, 48
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v16, v24
+; RV64-NEXT:    vmv.v.v v8, v24
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    li a2, 84
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a1, %hi(.LCPI8_3)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI8_3)
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v0, v24, v8
-; RV64-NEXT:    lui a1, %hi(.LCPI8_4)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI8_4)
-; RV64-NEXT:    vle16.v v8, (a1)
-; RV64-NEXT:    lui a1, %hi(.LCPI8_5)
-; RV64-NEXT:    addi a1, a1, %lo(.LCPI8_5)
-; RV64-NEXT:    vle16.v v10, (a1)
+; RV64-NEXT:    vle16.v v16, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 6
+; RV64-NEXT:    slli a1, a1, 4
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs2r.v v10, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v24, v8, v16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 56
+; RV64-NEXT:    li a2, 80
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v12, v0
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 5
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv.v.v v8, v24
+; RV64-NEXT:    lui a1, %hi(.LCPI8_4)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI8_4)
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v24, v0, v8
+; RV64-NEXT:    vle16.v v16, (a1)
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v24, v0, v16
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 72
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v8, v24
+; RV64-NEXT:    vmv.v.v v12, v24
+; RV64-NEXT:    lui a1, %hi(.LCPI8_5)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI8_5)
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vle16.v v16, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 76
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v24, v0, v16
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a1, a1, 6
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl2r.v v20, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v24, v0, v20
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 68
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v28, v24
+; RV64-NEXT:    vmv.v.v v16, v24
 ; RV64-NEXT:    addi a1, a0, 256
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a1)
+; RV64-NEXT:    vse64.v v12, (a1)
 ; RV64-NEXT:    addi a1, a0, 320
-; RV64-NEXT:    vse64.v v28, (a1)
+; RV64-NEXT:    vse64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 192
-; RV64-NEXT:    vse64.v v12, (a1)
+; RV64-NEXT:    vse64.v v8, (a1)
 ; RV64-NEXT:    addi a1, a0, 128
-; RV64-NEXT:    vse64.v v16, (a1)
-; RV64-NEXT:    addi a1, a0, 64
 ; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    li a3, 84
 ; RV64-NEXT:    mul a2, a2, a3
@@ -1123,8 +1119,10 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vse64.v v8, (a1)
+; RV64-NEXT:    addi a1, a0, 64
+; RV64-NEXT:    vse64.v v20, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 60
+; RV64-NEXT:    li a2, 68
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
index f27614c93985f..118408d40c669 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
@@ -612,50 +612,51 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -128
 ; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vfmv.f.s fa5, v8
+; RV64-NEXT:    vslidedown.vi v12, v8, 3
+; RV64-NEXT:    vslidedown.vi v13, v8, 2
+; RV64-NEXT:    vslidedown.vi v14, v8, 1
+; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v16, v8, 7
+; RV64-NEXT:    vslidedown.vi v18, v8, 6
+; RV64-NEXT:    vslidedown.vi v20, v8, 5
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vse32.v v8, (a0)
-; RV64-NEXT:    flw fa5, 124(sp)
-; RV64-NEXT:    vfmv.f.s fa4, v8
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vslidedown.vi v11, v8, 2
+; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v8, v8, 4
 ; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    sd a0, 248(sp)
-; RV64-NEXT:    flw fa5, 120(sp)
-; RV64-NEXT:    vslidedown.vi v12, v8, 1
-; RV64-NEXT:    fcvt.l.s a0, fa4
-; RV64-NEXT:    vfmv.f.s fa4, v10
+; RV64-NEXT:    vfmv.f.s fa5, v12
 ; RV64-NEXT:    fcvt.l.s a1, fa5
-; RV64-NEXT:    sd a1, 240(sp)
-; RV64-NEXT:    flw fa5, 116(sp)
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v14, v8, 7
-; RV64-NEXT:    fcvt.l.s a1, fa4
-; RV64-NEXT:    vfmv.f.s fa4, v11
+; RV64-NEXT:    vfmv.f.s fa5, v13
 ; RV64-NEXT:    fcvt.l.s a2, fa5
-; RV64-NEXT:    sd a2, 232(sp)
-; RV64-NEXT:    flw fa5, 112(sp)
-; RV64-NEXT:    fcvt.l.s a2, fa4
-; RV64-NEXT:    vfmv.f.s fa4, v12
-; RV64-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-NEXT:    vfmv.f.s fa5, v14
 ; RV64-NEXT:    fcvt.l.s a3, fa5
-; RV64-NEXT:    sd a3, 224(sp)
-; RV64-NEXT:    flw fa5, 108(sp)
-; RV64-NEXT:    fcvt.l.s a3, fa4
-; RV64-NEXT:    vfmv.f.s fa4, v14
-; RV64-NEXT:    vslidedown.vi v12, v8, 5
+; RV64-NEXT:    vfmv.f.s fa5, v16
 ; RV64-NEXT:    fcvt.l.s a4, fa5
-; RV64-NEXT:    sd a4, 216(sp)
-; RV64-NEXT:    flw fa5, 104(sp)
-; RV64-NEXT:    fcvt.l.s a4, fa4
-; RV64-NEXT:    vfmv.f.s fa4, v10
-; RV64-NEXT:    fcvt.l.s a5, fa4
+; RV64-NEXT:    vfmv.f.s fa5, v18
+; RV64-NEXT:    fcvt.l.s a5, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v20
 ; RV64-NEXT:    fcvt.l.s a6, fa5
-; RV64-NEXT:    sd a6, 208(sp)
+; RV64-NEXT:    flw fa5, 124(sp)
+; RV64-NEXT:    fcvt.l.s a7, fa5
+; RV64-NEXT:    sd a7, 248(sp)
+; RV64-NEXT:    flw fa5, 120(sp)
+; RV64-NEXT:    fcvt.l.s a7, fa5
+; RV64-NEXT:    sd a7, 240(sp)
+; RV64-NEXT:    flw fa5, 116(sp)
+; RV64-NEXT:    fcvt.l.s a7, fa5
+; RV64-NEXT:    sd a7, 232(sp)
+; RV64-NEXT:    flw fa5, 112(sp)
+; RV64-NEXT:    fcvt.l.s a7, fa5
+; RV64-NEXT:    sd a7, 224(sp)
+; RV64-NEXT:    flw fa5, 108(sp)
+; RV64-NEXT:    fcvt.l.s a7, fa5
+; RV64-NEXT:    sd a7, 216(sp)
+; RV64-NEXT:    flw fa5, 104(sp)
+; RV64-NEXT:    fcvt.l.s a7, fa5
+; RV64-NEXT:    sd a7, 208(sp)
 ; RV64-NEXT:    flw fa5, 100(sp)
-; RV64-NEXT:    vfmv.f.s fa4, v12
-; RV64-NEXT:    fcvt.l.s a6, fa4
-; RV64-NEXT:    vslidedown.vi v8, v8, 4
 ; RV64-NEXT:    fcvt.l.s a7, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-NEXT:    sd a7, 200(sp)
@@ -981,26 +982,27 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    mv a0, sp
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vfmv.f.s fa5, v8
+; RV64-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v14, v8, 3
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vse64.v v8, (a0)
-; RV64-NEXT:    fld fa5, 56(sp)
-; RV64-NEXT:    vfmv.f.s fa4, v8
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    fcvt.l.d a0, fa4
-; RV64-NEXT:    fcvt.l.d a1, fa5
-; RV64-NEXT:    sd a1, 120(sp)
-; RV64-NEXT:    fld fa5, 48(sp)
-; RV64-NEXT:    vfmv.f.s fa4, v10
 ; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    fcvt.l.d a1, fa4
+; RV64-NEXT:    vslidedown.vi v8, v8, 2
+; RV64-NEXT:    fcvt.l.d a0, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v12
+; RV64-NEXT:    fcvt.l.d a1, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v14
 ; RV64-NEXT:    fcvt.l.d a2, fa5
-; RV64-NEXT:    sd a2, 112(sp)
+; RV64-NEXT:    fld fa5, 56(sp)
+; RV64-NEXT:    fcvt.l.d a3, fa5
+; RV64-NEXT:    sd a3, 120(sp)
+; RV64-NEXT:    fld fa5, 48(sp)
+; RV64-NEXT:    fcvt.l.d a3, fa5
+; RV64-NEXT:    sd a3, 112(sp)
 ; RV64-NEXT:    fld fa5, 40(sp)
-; RV64-NEXT:    vfmv.f.s fa4, v10
-; RV64-NEXT:    fcvt.l.d a2, fa4
-; RV64-NEXT:    vslidedown.vi v8, v8, 2
 ; RV64-NEXT:    fcvt.l.d a3, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-NEXT:    sd a3, 104(sp)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
index 2f58e3dd2769f..23ecc74880c6a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
@@ -363,50 +363,51 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) {
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    mv a0, sp
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vfmv.f.s fa5, v8
+; RV32-NEXT:    vslidedown.vi v12, v8, 3
+; RV32-NEXT:    vslidedown.vi v13, v8, 2
+; RV32-NEXT:    vslidedown.vi v14, v8, 1
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v16, v8, 7
+; RV32-NEXT:    vslidedown.vi v18, v8, 6
+; RV32-NEXT:    vslidedown.vi v20, v8, 5
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vse32.v v8, (a0)
-; RV32-NEXT:    flw fa5, 60(sp)
-; RV32-NEXT:    vfmv.f.s fa4, v8
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vslidedown.vi v11, v8, 2
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 4
 ; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    sw a0, 124(sp)
-; RV32-NEXT:    flw fa5, 56(sp)
-; RV32-NEXT:    fcvt.w.s a0, fa4
-; RV32-NEXT:    vfmv.f.s fa4, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
+; RV32-NEXT:    vfmv.f.s fa5, v12
 ; RV32-NEXT:    fcvt.w.s a1, fa5
-; RV32-NEXT:    sw a1, 120(sp)
-; RV32-NEXT:    flw fa5, 52(sp)
-; RV32-NEXT:    fcvt.w.s a1, fa4
-; RV32-NEXT:    vfmv.f.s fa4, v11
-; RV32-NEXT:    fcvt.w.s a2, fa4
+; RV32-NEXT:    vfmv.f.s fa5, v13
+; RV32-NEXT:    fcvt.w.s a2, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v14
 ; RV32-NEXT:    fcvt.w.s a3, fa5
-; RV32-NEXT:    sw a3, 116(sp)
-; RV32-NEXT:    flw fa5, 48(sp)
-; RV32-NEXT:    vfmv.f.s fa4, v10
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 7
-; RV32-NEXT:    fcvt.w.s a3, fa4
+; RV32-NEXT:    vfmv.f.s fa5, v16
 ; RV32-NEXT:    fcvt.w.s a4, fa5
-; RV32-NEXT:    sw a4, 112(sp)
-; RV32-NEXT:    flw fa5, 44(sp)
-; RV32-NEXT:    vfmv.f.s fa4, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 6
-; RV32-NEXT:    fcvt.w.s a4, fa4
+; RV32-NEXT:    vfmv.f.s fa5, v18
 ; RV32-NEXT:    fcvt.w.s a5, fa5
-; RV32-NEXT:    sw a5, 108(sp)
-; RV32-NEXT:    flw fa5, 40(sp)
-; RV32-NEXT:    vfmv.f.s fa4, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 5
-; RV32-NEXT:    fcvt.w.s a5, fa4
+; RV32-NEXT:    vfmv.f.s fa5, v20
 ; RV32-NEXT:    fcvt.w.s a6, fa5
-; RV32-NEXT:    sw a6, 104(sp)
+; RV32-NEXT:    flw fa5, 60(sp)
+; RV32-NEXT:    fcvt.w.s a7, fa5
+; RV32-NEXT:    sw a7, 124(sp)
+; RV32-NEXT:    flw fa5, 56(sp)
+; RV32-NEXT:    fcvt.w.s a7, fa5
+; RV32-NEXT:    sw a7, 120(sp)
+; RV32-NEXT:    flw fa5, 52(sp)
+; RV32-NEXT:    fcvt.w.s a7, fa5
+; RV32-NEXT:    sw a7, 116(sp)
+; RV32-NEXT:    flw fa5, 48(sp)
+; RV32-NEXT:    fcvt.w.s a7, fa5
+; RV32-NEXT:    sw a7, 112(sp)
+; RV32-NEXT:    flw fa5, 44(sp)
+; RV32-NEXT:    fcvt.w.s a7, fa5
+; RV32-NEXT:    sw a7, 108(sp)
+; RV32-NEXT:    flw fa5, 40(sp)
+; RV32-NEXT:    fcvt.w.s a7, fa5
+; RV32-NEXT:    sw a7, 104(sp)
 ; RV32-NEXT:    flw fa5, 36(sp)
-; RV32-NEXT:    vfmv.f.s fa4, v10
-; RV32-NEXT:    fcvt.w.s a6, fa4
-; RV32-NEXT:    vslidedown.vi v8, v8, 4
 ; RV32-NEXT:    fcvt.w.s a7, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
 ; RV32-NEXT:    sw a7, 100(sp)
@@ -447,50 +448,51 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) {
 ; RV64-i32-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-i32-NEXT:    andi sp, sp, -64
 ; RV64-i32-NEXT:    mv a0, sp
+; RV64-i32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-i32-NEXT:    vfmv.f.s fa5, v8
+; RV64-i32-NEXT:    vslidedown.vi v12, v8, 3
+; RV64-i32-NEXT:    vslidedown.vi v13, v8, 2
+; RV64-i32-NEXT:    vslidedown.vi v14, v8, 1
+; RV64-i32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-i32-NEXT:    vslidedown.vi v16, v8, 7
+; RV64-i32-NEXT:    vslidedown.vi v18, v8, 6
+; RV64-i32-NEXT:    vslidedown.vi v20, v8, 5
 ; RV64-i32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-i32-NEXT:    vse32.v v8, (a0)
-; RV64-i32-NEXT:    flw fa5, 60(sp)
-; RV64-i32-NEXT:    vfmv.f.s fa4, v8
-; RV64-i32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-i32-NEXT:    vslidedown.vi v11, v8, 2
+; RV64-i32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-i32-NEXT:    vslidedown.vi v8, v8, 4
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    sw a0, 124(sp)
-; RV64-i32-NEXT:    flw fa5, 56(sp)
-; RV64-i32-NEXT:    fcvt.l.s a0, fa4
-; RV64-i32-NEXT:    vfmv.f.s fa4, v10
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 1
+; RV64-i32-NEXT:    vfmv.f.s fa5, v12
 ; RV64-i32-NEXT:    fcvt.l.s a1, fa5
-; RV64-i32-NEXT:    sw a1, 120(sp)
-; RV64-i32-NEXT:    flw fa5, 52(sp)
-; RV64-i32-NEXT:    fcvt.l.s a1, fa4
-; RV64-i32-NEXT:    vfmv.f.s fa4, v11
-; RV64-i32-NEXT:    fcvt.l.s a2, fa4
+; RV64-i32-NEXT:    vfmv.f.s fa5, v13
+; RV64-i32-NEXT:    fcvt.l.s a2, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v14
 ; RV64-i32-NEXT:    fcvt.l.s a3, fa5
-; RV64-i32-NEXT:    sw a3, 116(sp)
-; RV64-i32-NEXT:    flw fa5, 48(sp)
-; RV64-i32-NEXT:    vfmv.f.s fa4, v10
-; RV64-i32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 7
-; RV64-i32-NEXT:    fcvt.l.s a3, fa4
+; RV64-i32-NEXT:    vfmv.f.s fa5, v16
 ; RV64-i32-NEXT:    fcvt.l.s a4, fa5
-; RV64-i32-NEXT:    sw a4, 112(sp)
-; RV64-i32-NEXT:    flw fa5, 44(sp)
-; RV64-i32-NEXT:    vfmv.f.s fa4, v10
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 6
-; RV64-i32-NEXT:    fcvt.l.s a4, fa4
+; RV64-i32-NEXT:    vfmv.f.s fa5, v18
 ; RV64-i32-NEXT:    fcvt.l.s a5, fa5
-; RV64-i32-NEXT:    sw a5, 108(sp)
-; RV64-i32-NEXT:    flw fa5, 40(sp)
-; RV64-i32-NEXT:    vfmv.f.s fa4, v10
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 5
-; RV64-i32-NEXT:    fcvt.l.s a5, fa4
+; RV64-i32-NEXT:    vfmv.f.s fa5, v20
 ; RV64-i32-NEXT:    fcvt.l.s a6, fa5
-; RV64-i32-NEXT:    sw a6, 104(sp)
+; RV64-i32-NEXT:    flw fa5, 60(sp)
+; RV64-i32-NEXT:    fcvt.l.s a7, fa5
+; RV64-i32-NEXT:    sw a7, 124(sp)
+; RV64-i32-NEXT:    flw fa5, 56(sp)
+; RV64-i32-NEXT:    fcvt.l.s a7, fa5
+; RV64-i32-NEXT:    sw a7, 120(sp)
+; RV64-i32-NEXT:    flw fa5, 52(sp)
+; RV64-i32-NEXT:    fcvt.l.s a7, fa5
+; RV64-i32-NEXT:    sw a7, 116(sp)
+; RV64-i32-NEXT:    flw fa5, 48(sp)
+; RV64-i32-NEXT:    fcvt.l.s a7, fa5
+; RV64-i32-NEXT:    sw a7, 112(sp)
+; RV64-i32-NEXT:    flw fa5, 44(sp)
+; RV64-i32-NEXT:    fcvt.l.s a7, fa5
+; RV64-i32-NEXT:    sw a7, 108(sp)
+; RV64-i32-NEXT:    flw fa5, 40(sp)
+; RV64-i32-NEXT:    fcvt.l.s a7, fa5
+; RV64-i32-NEXT:    sw a7, 104(sp)
 ; RV64-i32-NEXT:    flw fa5, 36(sp)
-; RV64-i32-NEXT:    vfmv.f.s fa4, v10
-; RV64-i32-NEXT:    fcvt.l.s a6, fa4
-; RV64-i32-NEXT:    vslidedown.vi v8, v8, 4
 ; RV64-i32-NEXT:    fcvt.l.s a7, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
 ; RV64-i32-NEXT:    sw a7, 100(sp)
@@ -531,50 +533,51 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) {
 ; RV64-i64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-i64-NEXT:    andi sp, sp, -128
 ; RV64-i64-NEXT:    addi a0, sp, 64
+; RV64-i64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-i64-NEXT:    vfmv.f.s fa5, v8
+; RV64-i64-NEXT:    vslidedown.vi v12, v8, 3
+; RV64-i64-NEXT:    vslidedown.vi v13, v8, 2
+; RV64-i64-NEXT:    vslidedown.vi v14, v8, 1
+; RV64-i64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-i64-NEXT:    vslidedown.vi v16, v8, 7
+; RV64-i64-NEXT:    vslidedown.vi v18, v8, 6
+; RV64-i64-NEXT:    vslidedown.vi v20, v8, 5
 ; RV64-i64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-i64-NEXT:    vse32.v v8, (a0)
-; RV64-i64-NEXT:    flw fa5, 124(sp)
-; RV64-i64-NEXT:    vfmv.f.s fa4, v8
-; RV64-i64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-i64-NEXT:    vslidedown.vi v11, v8, 2
+; RV64-i64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-i64-NEXT:    vslidedown.vi v8, v8, 4
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    sd a0, 248(sp)
-; RV64-i64-NEXT:    flw fa5, 120(sp)
-; RV64-i64-NEXT:    vslidedown.vi v12, v8, 1
-; RV64-i64-NEXT:    fcvt.l.s a0, fa4
-; RV64-i64-NEXT:    vfmv.f.s fa4, v10
+; RV64-i64-NEXT:    vfmv.f.s fa5, v12
 ; RV64-i64-NEXT:    fcvt.l.s a1, fa5
-; RV64-i64-NEXT:    sd a1, 240(sp)
-; RV64-i64-NEXT:    flw fa5, 116(sp)
-; RV64-i64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v14, v8, 7
-; RV64-i64-NEXT:    fcvt.l.s a1, fa4
-; RV64-i64-NEXT:    vfmv.f.s fa4, v11
+; RV64-i64-NEXT:    vfmv.f.s fa5, v13
 ; RV64-i64-NEXT:    fcvt.l.s a2, fa5
-; RV64-i64-NEXT:    sd a2, 232(sp)
-; RV64-i64-NEXT:    flw fa5, 112(sp)
-; RV64-i64-NEXT:    fcvt.l.s a2, fa4
-; RV64-i64-NEXT:    vfmv.f.s fa4, v12
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-i64-NEXT:    vfmv.f.s fa5, v14
 ; RV64-i64-NEXT:    fcvt.l.s a3, fa5
-; RV64-i64-NEXT:    sd a3, 224(sp)
-; RV64-i64-NEXT:    flw fa5, 108(sp)
-; RV64-i64-NEXT:    fcvt.l.s a3, fa4
-; RV64-i64-NEXT:    vfmv.f.s fa4, v14
-; RV64-i64-NEXT:    vslidedown.vi v12, v8, 5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v16
 ; RV64-i64-NEXT:    fcvt.l.s a4, fa5
-; RV64-i64-NEXT:    sd a4, 216(sp)
-; RV64-i64-NEXT:    flw fa5, 104(sp)
-; RV64-i64-NEXT:    fcvt.l.s a4, fa4
-; RV64-i64-NEXT:    vfmv.f.s fa4, v10
-; RV64-i64-NEXT:    fcvt.l.s a5, fa4
+; RV64-i64-NEXT:    vfmv.f.s fa5, v18
+; RV64-i64-NEXT:    fcvt.l.s a5, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v20
 ; RV64-i64-NEXT:    fcvt.l.s a6, fa5
-; RV64-i64-NEXT:    sd a6, 208(sp)
+; RV64-i64-NEXT:    flw fa5, 124(sp)
+; RV64-i64-NEXT:    fcvt.l.s a7, fa5
+; RV64-i64-NEXT:    sd a7, 248(sp)
+; RV64-i64-NEXT:    flw fa5, 120(sp)
+; RV64-i64-NEXT:    fcvt.l.s a7, fa5
+; RV64-i64-NEXT:    sd a7, 240(sp)
+; RV64-i64-NEXT:    flw fa5, 116(sp)
+; RV64-i64-NEXT:    fcvt.l.s a7, fa5
+; RV64-i64-NEXT:    sd a7, 232(sp)
+; RV64-i64-NEXT:    flw fa5, 112(sp)
+; RV64-i64-NEXT:    fcvt.l.s a7, fa5
+; RV64-i64-NEXT:    sd a7, 224(sp)
+; RV64-i64-NEXT:    flw fa5, 108(sp)
+; RV64-i64-NEXT:    fcvt.l.s a7, fa5
+; RV64-i64-NEXT:    sd a7, 216(sp)
+; RV64-i64-NEXT:    flw fa5, 104(sp)
+; RV64-i64-NEXT:    fcvt.l.s a7, fa5
+; RV64-i64-NEXT:    sd a7, 208(sp)
 ; RV64-i64-NEXT:    flw fa5, 100(sp)
-; RV64-i64-NEXT:    vfmv.f.s fa4, v12
-; RV64-i64-NEXT:    fcvt.l.s a6, fa4
-; RV64-i64-NEXT:    vslidedown.vi v8, v8, 4
 ; RV64-i64-NEXT:    fcvt.l.s a7, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-i64-NEXT:    sd a7, 200(sp)
@@ -877,26 +880,27 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV64-i64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-i64-NEXT:    andi sp, sp, -64
 ; RV64-i64-NEXT:    mv a0, sp
+; RV64-i64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-i64-NEXT:    vfmv.f.s fa5, v8
+; RV64-i64-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-i64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV64-i64-NEXT:    vslidedown.vi v14, v8, 3
 ; RV64-i64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-i64-NEXT:    vse64.v v8, (a0)
-; RV64-i64-NEXT:    fld fa5, 56(sp)
-; RV64-i64-NEXT:    vfmv.f.s fa4, v8
-; RV64-i64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-i64-NEXT:    fcvt.l.d a0, fa4
-; RV64-i64-NEXT:    fcvt.l.d a1, fa5
-; RV64-i64-NEXT:    sd a1, 120(sp)
-; RV64-i64-NEXT:    fld fa5, 48(sp)
-; RV64-i64-NEXT:    vfmv.f.s fa4, v10
 ; RV64-i64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-i64-NEXT:    fcvt.l.d a1, fa4
+; RV64-i64-NEXT:    vslidedown.vi v8, v8, 2
+; RV64-i64-NEXT:    fcvt.l.d a0, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v12
+; RV64-i64-NEXT:    fcvt.l.d a1, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v14
 ; RV64-i64-NEXT:    fcvt.l.d a2, fa5
-; RV64-i64-NEXT:    sd a2, 112(sp)
+; RV64-i64-NEXT:    fld fa5, 56(sp)
+; RV64-i64-NEXT:    fcvt.l.d a3, fa5
+; RV64-i64-NEXT:    sd a3, 120(sp)
+; RV64-i64-NEXT:    fld fa5, 48(sp)
+; RV64-i64-NEXT:    fcvt.l.d a3, fa5
+; RV64-i64-NEXT:    sd a3, 112(sp)
 ; RV64-i64-NEXT:    fld fa5, 40(sp)
-; RV64-i64-NEXT:    vfmv.f.s fa4, v10
-; RV64-i64-NEXT:    fcvt.l.d a2, fa4
-; RV64-i64-NEXT:    vslidedown.vi v8, v8, 2
 ; RV64-i64-NEXT:    fcvt.l.d a3, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-i64-NEXT:    sd a3, 104(sp)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
index c29ccd45528b8..a258818539258 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
@@ -141,8 +141,8 @@ define <4 x i1> @buildvec_mask_nonconst_v4i1(i1 %x, i1 %y) {
 ; CHECK-LABEL: buildvec_mask_nonconst_v4i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 3
 ; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    vmv.v.i v0, 3
 ; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
@@ -151,8 +151,8 @@ define <4 x i1> @buildvec_mask_nonconst_v4i1(i1 %x, i1 %y) {
 ; ZVE32F-LABEL: buildvec_mask_nonconst_v4i1:
 ; ZVE32F:       # %bb.0:
 ; ZVE32F-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32F-NEXT:    vmv.v.i v0, 3
 ; ZVE32F-NEXT:    vmv.v.x v8, a1
+; ZVE32F-NEXT:    vmv.v.i v0, 3
 ; ZVE32F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
 ; ZVE32F-NEXT:    vmsne.vi v0, v8, 0
@@ -245,10 +245,10 @@ define <8 x i1> @buildvec_mask_v8i1() {
 define <8 x i1> @buildvec_mask_nonconst_v8i1(i1 %x, i1 %y) {
 ; CHECK-LABEL: buildvec_mask_nonconst_v8i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 19
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a2
 ; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    li a1, 19
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
@@ -256,10 +256,10 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1(i1 %x, i1 %y) {
 ;
 ; ZVE32F-LABEL: buildvec_mask_nonconst_v8i1:
 ; ZVE32F:       # %bb.0:
-; ZVE32F-NEXT:    li a2, 19
 ; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; ZVE32F-NEXT:    vmv.s.x v0, a2
 ; ZVE32F-NEXT:    vmv.v.x v8, a1
+; ZVE32F-NEXT:    li a1, 19
+; ZVE32F-NEXT:    vmv.s.x v0, a1
 ; ZVE32F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
 ; ZVE32F-NEXT:    vmsne.vi v0, v8, 0
@@ -282,12 +282,12 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) {
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vslide1down.vx v9, v8, a0
 ; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
 ; CHECK-NEXT:    vslide1down.vx v9, v9, a0
 ; CHECK-NEXT:    vslide1down.vx v8, v8, zero
 ; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
@@ -299,12 +299,12 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) {
 ; ZVE32F-NEXT:    vmv.v.x v8, a0
 ; ZVE32F-NEXT:    vslide1down.vx v9, v8, a0
 ; ZVE32F-NEXT:    li a0, 1
-; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
 ; ZVE32F-NEXT:    vslide1down.vx v9, v9, a0
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, zero
 ; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
 ; ZVE32F-NEXT:    vmsne.vi v0, v8, 0
@@ -327,12 +327,12 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vslide1down.vx v9, v8, a0
 ; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
 ; CHECK-NEXT:    vslide1down.vx v9, v9, a0
 ; CHECK-NEXT:    vslide1down.vx v8, v8, zero
 ; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
@@ -344,12 +344,12 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %
 ; ZVE32F-NEXT:    vmv.v.x v8, a0
 ; ZVE32F-NEXT:    vslide1down.vx v9, v8, a0
 ; ZVE32F-NEXT:    li a0, 1
-; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
 ; ZVE32F-NEXT:    vslide1down.vx v9, v9, a0
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, zero
 ; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
 ; ZVE32F-NEXT:    vmsne.vi v0, v8, 0
@@ -370,13 +370,13 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1(i1 %x, i1 %y) optsize {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslide1down.vx v9, v8, a0
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
@@ -386,13 +386,13 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1(i1 %x, i1 %y) optsize {
 ; ZVE32F:       # %bb.0:
 ; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; ZVE32F-NEXT:    vmv.v.x v8, a0
-; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslide1down.vx v9, v8, a0
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
 ; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
 ; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
+; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
 ; ZVE32F-NEXT:    vmsne.vi v0, v8, 0
@@ -528,12 +528,12 @@ define <128 x i1> @buildvec_mask_v128i1() {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a0, %hi(.LCPI20_0)
 ; RV64-NEXT:    ld a0, %lo(.LCPI20_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI20_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI20_1)(a1)
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v0, a0
+; RV64-NEXT:    lui a0, %hi(.LCPI20_1)
+; RV64-NEXT:    ld a0, %lo(.LCPI20_1)(a0)
 ; RV64-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
-; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    ret
 ;
 ; ZVE32F-LABEL: buildvec_mask_v128i1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll
index 979785dd2c024..84486a96873d4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll
@@ -24,11 +24,11 @@ define void @splat_zeros_v2i1(ptr %x) {
 define void @splat_v1i1(ptr %x, i1 %y) {
 ; CHECK-LABEL: splat_v1i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.s.x v8, a1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    andi a1, a1, 1
+; CHECK-NEXT:    vmv.s.x v9, a1
+; CHECK-NEXT:    vmsne.vi v0, v9, 0
 ; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
@@ -48,11 +48,11 @@ define void @splat_v1i1_icmp(ptr %x, i32 signext %y, i32 signext %z) {
 ; CHECK-LABEL: splat_v1i1_icmp:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xor a1, a1, a2
-; CHECK-NEXT:    seqz a1, a1
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.s.x v8, a1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    seqz a1, a1
+; CHECK-NEXT:    vmv.s.x v9, a1
+; CHECK-NEXT:    vmsne.vi v0, v9, 0
 ; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
@@ -84,9 +84,9 @@ define void @splat_v4i1(ptr %x, i1 %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmsne.vi v0, v9, 0
 ; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 232a364e87f0e..29e7179b65acb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -406,7 +406,6 @@ define <2 x i64> @mgather_v2i8_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, mu
 ; RV32ZVE32F-NEXT:    vluxei32.v v9, (zero), v8, v0.t
-; RV32ZVE32F-NEXT:    sw zero, 12(a0)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v9
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -415,6 +414,7 @@ define <2 x i64> @mgather_v2i8_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x
 ; RV32ZVE32F-NEXT:    sw a1, 0(a0)
 ; RV32ZVE32F-NEXT:    sw zero, 4(a0)
 ; RV32ZVE32F-NEXT:    sw a2, 8(a0)
+; RV32ZVE32F-NEXT:    sw zero, 12(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_v2i8_zextload_v2i64:
@@ -732,9 +732,9 @@ define <8 x i8> @mgather_baseidx_v8i8(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8
 ; RV64ZVE32F-NEXT:  .LBB12_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB12_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -755,9 +755,9 @@ define <8 x i8> @mgather_baseidx_v8i8(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:  .LBB12_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB12_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -1433,9 +1433,9 @@ define <8 x i16> @mgather_baseidx_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 x i1>
 ; RV64ZVE32F-NEXT:  .LBB23_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB23_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -1458,9 +1458,9 @@ define <8 x i16> @mgather_baseidx_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 x i1>
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:  .LBB23_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB23_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -1582,9 +1582,9 @@ define <8 x i16> @mgather_baseidx_sext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-NEXT:  .LBB24_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB24_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -1607,9 +1607,9 @@ define <8 x i16> @mgather_baseidx_sext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:  .LBB24_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB24_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -1732,9 +1732,9 @@ define <8 x i16> @mgather_baseidx_zext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-NEXT:  .LBB25_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB25_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -1758,9 +1758,9 @@ define <8 x i16> @mgather_baseidx_zext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:  .LBB25_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB25_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -1885,9 +1885,9 @@ define <8 x i16> @mgather_baseidx_v8i16(ptr %base, <8 x i16> %idxs, <8 x i1> %m,
 ; RV64ZVE32F-NEXT:  .LBB26_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB26_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -1909,9 +1909,9 @@ define <8 x i16> @mgather_baseidx_v8i16(ptr %base, <8 x i16> %idxs, <8 x i1> %m,
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:  .LBB26_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB26_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -2149,15 +2149,15 @@ define <2 x i64> @mgather_v2i32_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2
 ;
 ; RV32ZVE32F-LABEL: mgather_v2i32_zextload_v2i64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    addi a1, a0, 8
 ; RV32ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
 ; RV32ZVE32F-NEXT:    vluxei32.v v9, (zero), v8, v0.t
 ; RV32ZVE32F-NEXT:    sw zero, 4(a0)
 ; RV32ZVE32F-NEXT:    sw zero, 12(a0)
+; RV32ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vse32.v v9, (a0)
-; RV32ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
-; RV32ZVE32F-NEXT:    vse32.v v8, (a1)
+; RV32ZVE32F-NEXT:    addi a0, a0, 8
+; RV32ZVE32F-NEXT:    vse32.v v8, (a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_v2i32_zextload_v2i64:
@@ -2480,9 +2480,9 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1>
 ; RV64ZVE32F-NEXT:  .LBB35_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB35_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -2505,9 +2505,9 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1>
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 5
 ; RV64ZVE32F-NEXT:  .LBB35_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB35_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -2628,9 +2628,9 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-NEXT:  .LBB36_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB36_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -2653,9 +2653,9 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 5
 ; RV64ZVE32F-NEXT:  .LBB36_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB36_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -2780,9 +2780,9 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-NEXT:  .LBB37_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB37_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -2806,9 +2806,9 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 5
 ; RV64ZVE32F-NEXT:  .LBB37_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB37_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -2937,9 +2937,9 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i
 ; RV64ZVE32F-NEXT:  .LBB38_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB38_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -2962,9 +2962,9 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 5
 ; RV64ZVE32F-NEXT:  .LBB38_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB38_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -3087,9 +3087,9 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
 ; RV64ZVE32F-NEXT:  .LBB39_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB39_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -3112,9 +3112,9 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 5
 ; RV64ZVE32F-NEXT:  .LBB39_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB39_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -3240,9 +3240,9 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
 ; RV64ZVE32F-NEXT:  .LBB40_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB40_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -3266,9 +3266,9 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 5
 ; RV64ZVE32F-NEXT:  .LBB40_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB40_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -3391,9 +3391,9 @@ define <8 x i32> @mgather_baseidx_v8i32(ptr %base, <8 x i32> %idxs, <8 x i1> %m,
 ; RV64ZVE32F-NEXT:  .LBB41_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v12, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB41_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -3415,9 +3415,9 @@ define <8 x i32> @mgather_baseidx_v8i32(ptr %base, <8 x i32> %idxs, <8 x i1> %m,
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v8, 5
 ; RV64ZVE32F-NEXT:  .LBB41_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v12, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB41_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -4109,9 +4109,9 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1>
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    beqz a3, .LBB48_7
+; RV32ZVE32F-NEXT:    andi a1, t0, 1
+; RV32ZVE32F-NEXT:    beqz a1, .LBB48_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
 ; RV32ZVE32F-NEXT:    lw a1, 0(a3)
@@ -4272,9 +4272,9 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1>
 ; RV64ZVE32F-NEXT:  .LBB48_5: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    beqz a6, .LBB48_10
 ; RV64ZVE32F-NEXT:  # %bb.6: # %cond.load4
 ; RV64ZVE32F-NEXT:    vmv.x.s a6, v8
@@ -4320,8 +4320,8 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1>
 ; RV64ZVE32F-NEXT:    add t1, a1, t1
 ; RV64ZVE32F-NEXT:    ld t1, 0(t1)
 ; RV64ZVE32F-NEXT:  .LBB48_14: # %else14
-; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    beqz t2, .LBB48_17
 ; RV64ZVE32F-NEXT:  # %bb.15: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s t2, v8
@@ -4386,9 +4386,9 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    beqz a3, .LBB49_7
+; RV32ZVE32F-NEXT:    andi a1, t0, 1
+; RV32ZVE32F-NEXT:    beqz a1, .LBB49_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
 ; RV32ZVE32F-NEXT:    lw a1, 0(a3)
@@ -4549,9 +4549,9 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-NEXT:  .LBB49_5: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    beqz a6, .LBB49_10
 ; RV64ZVE32F-NEXT:  # %bb.6: # %cond.load4
 ; RV64ZVE32F-NEXT:    vmv.x.s a6, v8
@@ -4597,8 +4597,8 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-NEXT:    add t1, a1, t1
 ; RV64ZVE32F-NEXT:    ld t1, 0(t1)
 ; RV64ZVE32F-NEXT:  .LBB49_14: # %else14
-; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    beqz t2, .LBB49_17
 ; RV64ZVE32F-NEXT:  # %bb.15: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s t2, v8
@@ -4665,9 +4665,9 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    beqz a3, .LBB50_7
+; RV32ZVE32F-NEXT:    andi a1, t0, 1
+; RV32ZVE32F-NEXT:    beqz a1, .LBB50_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
 ; RV32ZVE32F-NEXT:    lw a1, 0(a3)
@@ -4830,9 +4830,9 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-NEXT:  .LBB50_5: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    beqz a6, .LBB50_10
 ; RV64ZVE32F-NEXT:  # %bb.6: # %cond.load4
 ; RV64ZVE32F-NEXT:    vmv.x.s a6, v8
@@ -4882,8 +4882,8 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-NEXT:    add t1, a1, t1
 ; RV64ZVE32F-NEXT:    ld t1, 0(t1)
 ; RV64ZVE32F-NEXT:  .LBB50_14: # %else14
-; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    beqz t2, .LBB50_17
 ; RV64ZVE32F-NEXT:  # %bb.15: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s t2, v8
@@ -4950,10 +4950,10 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i
 ; RV32ZVE32F-NEXT:    li a1, 8
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vwmaccus.vx v10, a1, v8
-; RV32ZVE32F-NEXT:    beqz a3, .LBB51_7
+; RV32ZVE32F-NEXT:    andi a1, t0, 1
+; RV32ZVE32F-NEXT:    beqz a1, .LBB51_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v10
@@ -5116,9 +5116,9 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i
 ; RV64ZVE32F-NEXT:  .LBB51_5: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    beqz a6, .LBB51_10
 ; RV64ZVE32F-NEXT:  # %bb.6: # %cond.load4
 ; RV64ZVE32F-NEXT:    vmv.x.s a6, v8
@@ -5164,8 +5164,8 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i
 ; RV64ZVE32F-NEXT:    add t1, a1, t1
 ; RV64ZVE32F-NEXT:    ld t1, 0(t1)
 ; RV64ZVE32F-NEXT:  .LBB51_14: # %else14
-; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    beqz t2, .LBB51_17
 ; RV64ZVE32F-NEXT:  # %bb.15: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s t2, v8
@@ -5229,10 +5229,10 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV32ZVE32F-NEXT:    li a1, 8
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vwmaccus.vx v10, a1, v8
-; RV32ZVE32F-NEXT:    beqz a3, .LBB52_7
+; RV32ZVE32F-NEXT:    andi a1, t0, 1
+; RV32ZVE32F-NEXT:    beqz a1, .LBB52_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v10
@@ -5395,9 +5395,9 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV64ZVE32F-NEXT:  .LBB52_5: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    beqz a6, .LBB52_10
 ; RV64ZVE32F-NEXT:  # %bb.6: # %cond.load4
 ; RV64ZVE32F-NEXT:    vmv.x.s a6, v8
@@ -5443,8 +5443,8 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV64ZVE32F-NEXT:    add t1, a1, t1
 ; RV64ZVE32F-NEXT:    ld t1, 0(t1)
 ; RV64ZVE32F-NEXT:  .LBB52_14: # %else14
-; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    beqz t2, .LBB52_17
 ; RV64ZVE32F-NEXT:  # %bb.15: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s t2, v8
@@ -5510,10 +5510,10 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV32ZVE32F-NEXT:    li a1, 8
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vwmaccu.vx v10, a1, v8
-; RV32ZVE32F-NEXT:    beqz a3, .LBB53_7
+; RV32ZVE32F-NEXT:    andi a1, t0, 1
+; RV32ZVE32F-NEXT:    beqz a1, .LBB53_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v10
@@ -5678,9 +5678,9 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV64ZVE32F-NEXT:  .LBB53_5: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    beqz a6, .LBB53_10
 ; RV64ZVE32F-NEXT:  # %bb.6: # %cond.load4
 ; RV64ZVE32F-NEXT:    vmv.x.s a6, v8
@@ -5730,8 +5730,8 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV64ZVE32F-NEXT:    add t1, a1, t1
 ; RV64ZVE32F-NEXT:    ld t1, 0(t1)
 ; RV64ZVE32F-NEXT:  .LBB53_14: # %else14
-; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    beqz t2, .LBB53_17
 ; RV64ZVE32F-NEXT:  # %bb.15: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s t2, v8
@@ -5797,10 +5797,10 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    beqz a3, .LBB54_7
+; RV32ZVE32F-NEXT:    andi a1, t0, 1
+; RV32ZVE32F-NEXT:    beqz a1, .LBB54_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
 ; RV32ZVE32F-NEXT:    lw a1, 0(a3)
@@ -5962,9 +5962,9 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i
 ; RV64ZVE32F-NEXT:  .LBB54_5: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    beqz a6, .LBB54_10
 ; RV64ZVE32F-NEXT:  # %bb.6: # %cond.load4
 ; RV64ZVE32F-NEXT:    vmv.x.s a6, v8
@@ -6010,8 +6010,8 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i
 ; RV64ZVE32F-NEXT:    add t1, a1, t1
 ; RV64ZVE32F-NEXT:    ld t1, 0(t1)
 ; RV64ZVE32F-NEXT:  .LBB54_14: # %else14
-; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    beqz t2, .LBB54_17
 ; RV64ZVE32F-NEXT:  # %bb.15: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s t2, v8
@@ -6074,10 +6074,10 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    beqz a3, .LBB55_7
+; RV32ZVE32F-NEXT:    andi a1, t0, 1
+; RV32ZVE32F-NEXT:    beqz a1, .LBB55_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
 ; RV32ZVE32F-NEXT:    lw a1, 0(a3)
@@ -6239,9 +6239,9 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <
 ; RV64ZVE32F-NEXT:  .LBB55_5: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    beqz a6, .LBB55_10
 ; RV64ZVE32F-NEXT:  # %bb.6: # %cond.load4
 ; RV64ZVE32F-NEXT:    vmv.x.s a6, v8
@@ -6287,8 +6287,8 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <
 ; RV64ZVE32F-NEXT:    add t1, a1, t1
 ; RV64ZVE32F-NEXT:    ld t1, 0(t1)
 ; RV64ZVE32F-NEXT:  .LBB55_14: # %else14
-; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    beqz t2, .LBB55_17
 ; RV64ZVE32F-NEXT:  # %bb.15: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s t2, v8
@@ -6352,10 +6352,10 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    beqz a3, .LBB56_7
+; RV32ZVE32F-NEXT:    andi a1, t0, 1
+; RV32ZVE32F-NEXT:    beqz a1, .LBB56_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
 ; RV32ZVE32F-NEXT:    lw a1, 0(a3)
@@ -6519,9 +6519,9 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <
 ; RV64ZVE32F-NEXT:  .LBB56_5: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a6, a5, 4
 ; RV64ZVE32F-NEXT:    beqz a6, .LBB56_10
 ; RV64ZVE32F-NEXT:  # %bb.6: # %cond.load4
 ; RV64ZVE32F-NEXT:    vmv.x.s a6, v8
@@ -6571,8 +6571,8 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <
 ; RV64ZVE32F-NEXT:    add t1, a1, t1
 ; RV64ZVE32F-NEXT:    ld t1, 0(t1)
 ; RV64ZVE32F-NEXT:  .LBB56_14: # %else14
-; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi t2, a5, 64
 ; RV64ZVE32F-NEXT:    beqz t2, .LBB56_17
 ; RV64ZVE32F-NEXT:  # %bb.15: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s t2, v8
@@ -6654,9 +6654,9 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    andi a2, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    beqz a2, .LBB57_7
+; RV32ZVE32F-NEXT:    andi a1, t0, 1
+; RV32ZVE32F-NEXT:    beqz a1, .LBB57_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
 ; RV32ZVE32F-NEXT:    lw a1, 0(a2)
@@ -7073,14 +7073,14 @@ define <4 x bfloat> @mgather_truemask_v4bf16(<4 x ptr> %ptrs, <4 x bfloat> %pass
 ; RV64ZVE32F-NEXT:    ld a2, 8(a0)
 ; RV64ZVE32F-NEXT:    ld a3, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a0, 24(a0)
-; RV64ZVE32F-NEXT:    lh a1, 0(a1)
 ; RV64ZVE32F-NEXT:    lh a2, 0(a2)
+; RV64ZVE32F-NEXT:    lh a1, 0(a1)
 ; RV64ZVE32F-NEXT:    lh a3, 0(a3)
-; RV64ZVE32F-NEXT:    lh a0, 0(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
+; RV64ZVE32F-NEXT:    lh a0, 0(a0)
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64ZVE32F-NEXT:    ret
   %v = call <4 x bfloat> @llvm.masked.gather.v4bf16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1), <4 x bfloat> %passthru)
@@ -7271,9 +7271,9 @@ define <8 x bfloat> @mgather_baseidx_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, <8 x
 ; RV64ZVE32F-NEXT:  .LBB64_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB64_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -7296,9 +7296,9 @@ define <8 x bfloat> @mgather_baseidx_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, <8 x
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:  .LBB64_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB64_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -7420,9 +7420,9 @@ define <8 x bfloat> @mgather_baseidx_sext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs,
 ; RV64ZVE32F-NEXT:  .LBB65_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB65_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -7445,9 +7445,9 @@ define <8 x bfloat> @mgather_baseidx_sext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs,
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:  .LBB65_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB65_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -7570,9 +7570,9 @@ define <8 x bfloat> @mgather_baseidx_zext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs,
 ; RV64ZVE32F-NEXT:  .LBB66_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB66_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -7596,9 +7596,9 @@ define <8 x bfloat> @mgather_baseidx_zext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs,
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:  .LBB66_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB66_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -7723,9 +7723,9 @@ define <8 x bfloat> @mgather_baseidx_v8bf16(ptr %base, <8 x i16> %idxs, <8 x i1>
 ; RV64ZVE32F-NEXT:  .LBB67_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB67_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -7747,9 +7747,9 @@ define <8 x bfloat> @mgather_baseidx_v8bf16(ptr %base, <8 x i16> %idxs, <8 x i1>
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:  .LBB67_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB67_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -8085,14 +8085,14 @@ define <4 x half> @mgather_truemask_v4f16(<4 x ptr> %ptrs, <4 x half> %passthru)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    ld a2, 8(a0)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    ld a3, 16(a0)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    ld a0, 24(a0)
-; RV64ZVE32F-ZVFHMIN-NEXT:    lh a1, 0(a1)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    lh a2, 0(a2)
+; RV64ZVE32F-ZVFHMIN-NEXT:    lh a1, 0(a1)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    lh a3, 0(a3)
-; RV64ZVE32F-ZVFHMIN-NEXT:    lh a0, 0(a0)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a3
+; RV64ZVE32F-ZVFHMIN-NEXT:    lh a0, 0(a0)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64ZVE32F-ZVFHMIN-NEXT:    ret
   %v = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1), <4 x half> %passthru)
@@ -8376,9 +8376,9 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1
 ; RV64ZVE32F-ZVFH-NEXT:  .LBB74_4: # %else2
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFH-NEXT:    bnez a2, .LBB74_14
 ; RV64ZVE32F-ZVFH-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 8
@@ -8401,9 +8401,9 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-ZVFH-NEXT:  .LBB74_9: # %else14
-; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFH-NEXT:    beqz a2, .LBB74_11
 ; RV64ZVE32F-ZVFH-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-ZVFH-NEXT:    vmv.x.s a2, v8
@@ -8500,9 +8500,9 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB74_4: # %else2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    bnez a2, .LBB74_14
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 8
@@ -8525,9 +8525,9 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB74_9: # %else14
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFHMIN-NEXT:    beqz a2, .LBB74_11
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v8
@@ -8649,9 +8649,9 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-ZVFH-NEXT:  .LBB75_4: # %else2
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFH-NEXT:    bnez a2, .LBB75_14
 ; RV64ZVE32F-ZVFH-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 8
@@ -8674,9 +8674,9 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-ZVFH-NEXT:  .LBB75_9: # %else14
-; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFH-NEXT:    beqz a2, .LBB75_11
 ; RV64ZVE32F-ZVFH-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-ZVFH-NEXT:    vmv.x.s a2, v8
@@ -8773,9 +8773,9 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB75_4: # %else2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    bnez a2, .LBB75_14
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 8
@@ -8798,9 +8798,9 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB75_9: # %else14
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFHMIN-NEXT:    beqz a2, .LBB75_11
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v8
@@ -8923,9 +8923,9 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-ZVFH-NEXT:  .LBB76_4: # %else2
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFH-NEXT:    bnez a2, .LBB76_14
 ; RV64ZVE32F-ZVFH-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 8
@@ -8949,9 +8949,9 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-ZVFH-NEXT:  .LBB76_9: # %else14
-; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFH-NEXT:    beqz a2, .LBB76_11
 ; RV64ZVE32F-ZVFH-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-ZVFH-NEXT:    vmv.x.s a2, v8
@@ -9055,9 +9055,9 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB76_4: # %else2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    bnez a2, .LBB76_14
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 8
@@ -9081,9 +9081,9 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB76_9: # %else14
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFHMIN-NEXT:    beqz a2, .LBB76_11
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v8
@@ -9208,9 +9208,9 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m
 ; RV64ZVE32F-ZVFH-NEXT:  .LBB77_4: # %else2
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFH-NEXT:    bnez a2, .LBB77_14
 ; RV64ZVE32F-ZVFH-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 8
@@ -9232,9 +9232,9 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-ZVFH-NEXT:  .LBB77_9: # %else14
-; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFH-NEXT:    beqz a2, .LBB77_11
 ; RV64ZVE32F-ZVFH-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-ZVFH-NEXT:    vmv.x.s a2, v8
@@ -9324,9 +9324,9 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB77_4: # %else2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    bnez a2, .LBB77_14
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 8
@@ -9348,9 +9348,9 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslideup.vi v9, v8, 5
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB77_9: # %else14
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFHMIN-NEXT:    beqz a2, .LBB77_11
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v8
@@ -9791,9 +9791,9 @@ define <8 x float> @mgather_baseidx_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i
 ; RV64ZVE32F-NEXT:  .LBB84_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB84_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -9816,9 +9816,9 @@ define <8 x float> @mgather_baseidx_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 5
 ; RV64ZVE32F-NEXT:  .LBB84_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB84_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -9939,9 +9939,9 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
 ; RV64ZVE32F-NEXT:  .LBB85_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB85_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -9964,9 +9964,9 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 5
 ; RV64ZVE32F-NEXT:  .LBB85_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB85_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -10091,9 +10091,9 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
 ; RV64ZVE32F-NEXT:  .LBB86_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB86_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -10117,9 +10117,9 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 5
 ; RV64ZVE32F-NEXT:  .LBB86_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB86_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -10248,9 +10248,9 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x
 ; RV64ZVE32F-NEXT:  .LBB87_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB87_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -10273,9 +10273,9 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 5
 ; RV64ZVE32F-NEXT:  .LBB87_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB87_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -10398,9 +10398,9 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-NEXT:  .LBB88_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB88_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -10423,9 +10423,9 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 5
 ; RV64ZVE32F-NEXT:  .LBB88_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB88_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -10551,9 +10551,9 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-NEXT:  .LBB89_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB89_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -10577,9 +10577,9 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 5
 ; RV64ZVE32F-NEXT:  .LBB89_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB89_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -10702,9 +10702,9 @@ define <8 x float> @mgather_baseidx_v8f32(ptr %base, <8 x i32> %idxs, <8 x i1> %
 ; RV64ZVE32F-NEXT:  .LBB90_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v12, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB90_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -10726,9 +10726,9 @@ define <8 x float> @mgather_baseidx_v8f32(ptr %base, <8 x i32> %idxs, <8 x i1> %
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v8, 5
 ; RV64ZVE32F-NEXT:  .LBB90_9: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v12, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB90_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -11308,9 +11308,9 @@ define <8 x double> @mgather_baseidx_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    bnez a3, .LBB97_10
+; RV32ZVE32F-NEXT:    andi a1, a2, 1
+; RV32ZVE32F-NEXT:    bnez a1, .LBB97_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a1, a2, 2
 ; RV32ZVE32F-NEXT:    bnez a1, .LBB97_11
@@ -11420,9 +11420,9 @@ define <8 x double> @mgather_baseidx_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x
 ; RV64ZVE32F-NEXT:  .LBB97_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    bnez a3, .LBB97_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a3, a2, 8
@@ -11440,8 +11440,8 @@ define <8 x double> @mgather_baseidx_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x
 ; RV64ZVE32F-NEXT:    add a3, a1, a3
 ; RV64ZVE32F-NEXT:    fld fa5, 0(a3)
 ; RV64ZVE32F-NEXT:  .LBB97_9: # %else14
-; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    beqz a3, .LBB97_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
@@ -11523,9 +11523,9 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    bnez a3, .LBB98_10
+; RV32ZVE32F-NEXT:    andi a1, a2, 1
+; RV32ZVE32F-NEXT:    bnez a1, .LBB98_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a1, a2, 2
 ; RV32ZVE32F-NEXT:    bnez a1, .LBB98_11
@@ -11635,9 +11635,9 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
 ; RV64ZVE32F-NEXT:  .LBB98_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    bnez a3, .LBB98_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a3, a2, 8
@@ -11655,8 +11655,8 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
 ; RV64ZVE32F-NEXT:    add a3, a1, a3
 ; RV64ZVE32F-NEXT:    fld fa5, 0(a3)
 ; RV64ZVE32F-NEXT:  .LBB98_9: # %else14
-; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    beqz a3, .LBB98_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
@@ -11740,9 +11740,9 @@ define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    bnez a3, .LBB99_10
+; RV32ZVE32F-NEXT:    andi a1, a2, 1
+; RV32ZVE32F-NEXT:    bnez a1, .LBB99_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a1, a2, 2
 ; RV32ZVE32F-NEXT:    bnez a1, .LBB99_11
@@ -11854,9 +11854,9 @@ define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
 ; RV64ZVE32F-NEXT:  .LBB99_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    bnez a3, .LBB99_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a3, a2, 8
@@ -11875,8 +11875,8 @@ define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
 ; RV64ZVE32F-NEXT:    add a3, a1, a3
 ; RV64ZVE32F-NEXT:    fld fa5, 0(a3)
 ; RV64ZVE32F-NEXT:  .LBB99_9: # %else14
-; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    beqz a3, .LBB99_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
@@ -11963,10 +11963,10 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8
 ; RV32ZVE32F-NEXT:    li a2, 8
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a3, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vwmaccus.vx v10, a2, v8
-; RV32ZVE32F-NEXT:    bnez a3, .LBB100_10
+; RV32ZVE32F-NEXT:    andi a2, a1, 1
+; RV32ZVE32F-NEXT:    bnez a2, .LBB100_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a2, a1, 2
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB100_11
@@ -12078,9 +12078,9 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8
 ; RV64ZVE32F-NEXT:  .LBB100_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    bnez a3, .LBB100_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a3, a2, 8
@@ -12098,8 +12098,8 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8
 ; RV64ZVE32F-NEXT:    add a3, a1, a3
 ; RV64ZVE32F-NEXT:    fld fa5, 0(a3)
 ; RV64ZVE32F-NEXT:  .LBB100_9: # %else14
-; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    beqz a3, .LBB100_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
@@ -12180,10 +12180,10 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
 ; RV32ZVE32F-NEXT:    li a2, 8
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a3, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vwmaccus.vx v10, a2, v8
-; RV32ZVE32F-NEXT:    bnez a3, .LBB101_10
+; RV32ZVE32F-NEXT:    andi a2, a1, 1
+; RV32ZVE32F-NEXT:    bnez a2, .LBB101_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a2, a1, 2
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB101_11
@@ -12295,9 +12295,9 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
 ; RV64ZVE32F-NEXT:  .LBB101_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    bnez a3, .LBB101_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a3, a2, 8
@@ -12315,8 +12315,8 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
 ; RV64ZVE32F-NEXT:    add a3, a1, a3
 ; RV64ZVE32F-NEXT:    fld fa5, 0(a3)
 ; RV64ZVE32F-NEXT:  .LBB101_9: # %else14
-; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    beqz a3, .LBB101_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
@@ -12399,10 +12399,10 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
 ; RV32ZVE32F-NEXT:    li a2, 8
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a3, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vwmaccu.vx v10, a2, v8
-; RV32ZVE32F-NEXT:    bnez a3, .LBB102_10
+; RV32ZVE32F-NEXT:    andi a2, a1, 1
+; RV32ZVE32F-NEXT:    bnez a2, .LBB102_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a2, a1, 2
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB102_11
@@ -12516,9 +12516,9 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
 ; RV64ZVE32F-NEXT:  .LBB102_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    bnez a3, .LBB102_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a3, a2, 8
@@ -12537,8 +12537,8 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
 ; RV64ZVE32F-NEXT:    add a3, a1, a3
 ; RV64ZVE32F-NEXT:    fld fa5, 0(a3)
 ; RV64ZVE32F-NEXT:  .LBB102_9: # %else14
-; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    beqz a3, .LBB102_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
@@ -12624,10 +12624,10 @@ define <8 x double> @mgather_baseidx_v8i32_v8f64(ptr %base, <8 x i32> %idxs, <8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    bnez a3, .LBB103_10
+; RV32ZVE32F-NEXT:    andi a1, a2, 1
+; RV32ZVE32F-NEXT:    bnez a1, .LBB103_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a1, a2, 2
 ; RV32ZVE32F-NEXT:    bnez a1, .LBB103_11
@@ -12738,9 +12738,9 @@ define <8 x double> @mgather_baseidx_v8i32_v8f64(ptr %base, <8 x i32> %idxs, <8
 ; RV64ZVE32F-NEXT:  .LBB103_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    bnez a3, .LBB103_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a3, a2, 8
@@ -12758,8 +12758,8 @@ define <8 x double> @mgather_baseidx_v8i32_v8f64(ptr %base, <8 x i32> %idxs, <8
 ; RV64ZVE32F-NEXT:    add a3, a1, a3
 ; RV64ZVE32F-NEXT:    fld fa5, 0(a3)
 ; RV64ZVE32F-NEXT:  .LBB103_9: # %else14
-; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    beqz a3, .LBB103_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
@@ -12839,10 +12839,10 @@ define <8 x double> @mgather_baseidx_sext_v8i32_v8f64(ptr %base, <8 x i32> %idxs
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    bnez a3, .LBB104_10
+; RV32ZVE32F-NEXT:    andi a1, a2, 1
+; RV32ZVE32F-NEXT:    bnez a1, .LBB104_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a1, a2, 2
 ; RV32ZVE32F-NEXT:    bnez a1, .LBB104_11
@@ -12953,9 +12953,9 @@ define <8 x double> @mgather_baseidx_sext_v8i32_v8f64(ptr %base, <8 x i32> %idxs
 ; RV64ZVE32F-NEXT:  .LBB104_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    bnez a3, .LBB104_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a3, a2, 8
@@ -12973,8 +12973,8 @@ define <8 x double> @mgather_baseidx_sext_v8i32_v8f64(ptr %base, <8 x i32> %idxs
 ; RV64ZVE32F-NEXT:    add a3, a1, a3
 ; RV64ZVE32F-NEXT:    fld fa5, 0(a3)
 ; RV64ZVE32F-NEXT:  .LBB104_9: # %else14
-; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    beqz a3, .LBB104_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
@@ -13055,10 +13055,10 @@ define <8 x double> @mgather_baseidx_zext_v8i32_v8f64(ptr %base, <8 x i32> %idxs
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    bnez a3, .LBB105_10
+; RV32ZVE32F-NEXT:    andi a1, a2, 1
+; RV32ZVE32F-NEXT:    bnez a1, .LBB105_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a1, a2, 2
 ; RV32ZVE32F-NEXT:    bnez a1, .LBB105_11
@@ -13171,9 +13171,9 @@ define <8 x double> @mgather_baseidx_zext_v8i32_v8f64(ptr %base, <8 x i32> %idxs
 ; RV64ZVE32F-NEXT:  .LBB105_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 4
 ; RV64ZVE32F-NEXT:    bnez a3, .LBB105_14
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a3, a2, 8
@@ -13192,8 +13192,8 @@ define <8 x double> @mgather_baseidx_zext_v8i32_v8f64(ptr %base, <8 x i32> %idxs
 ; RV64ZVE32F-NEXT:    add a3, a1, a3
 ; RV64ZVE32F-NEXT:    fld fa5, 0(a3)
 ; RV64ZVE32F-NEXT:  .LBB105_9: # %else14
-; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a3, a2, 64
 ; RV64ZVE32F-NEXT:    beqz a3, .LBB105_11
 ; RV64ZVE32F-NEXT:  # %bb.10: # %cond.load16
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
@@ -13295,9 +13295,9 @@ define <8 x double> @mgather_baseidx_v8f64(ptr %base, <8 x i64> %idxs, <8 x i1>
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    bnez a3, .LBB106_10
+; RV32ZVE32F-NEXT:    andi a1, a2, 1
+; RV32ZVE32F-NEXT:    bnez a1, .LBB106_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a1, a2, 2
 ; RV32ZVE32F-NEXT:    bnez a1, .LBB106_11
@@ -13528,9 +13528,9 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m
 ; RV64ZVE32F-NEXT:  .LBB107_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB107_25
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -13546,9 +13546,9 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m
 ; RV64ZVE32F-NEXT:    vmv.s.x v11, a2
 ; RV64ZVE32F-NEXT:    vslideup.vi v9, v11, 4
 ; RV64ZVE32F-NEXT:  .LBB107_8: # %else11
-; RV64ZVE32F-NEXT:    andi a2, a1, 32
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 8
+; RV64ZVE32F-NEXT:    andi a2, a1, 32
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB107_10
 ; RV64ZVE32F-NEXT:  # %bb.9: # %cond.load13
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
@@ -13560,9 +13560,9 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e8, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v9, v11, 5
 ; RV64ZVE32F-NEXT:  .LBB107_10: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB107_27
 ; RV64ZVE32F-NEXT:  # %bb.11: # %else17
 ; RV64ZVE32F-NEXT:    andi a2, a1, 128
@@ -13585,9 +13585,9 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m
 ; RV64ZVE32F-NEXT:  .LBB107_15: # %else26
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 1024
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 1024
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB107_30
 ; RV64ZVE32F-NEXT:  # %bb.16: # %else29
 ; RV64ZVE32F-NEXT:    slli a2, a1, 52
@@ -13608,9 +13608,9 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m
 ; RV64ZVE32F-NEXT:    vsetivli zero, 14, e8, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v9, v8, 13
 ; RV64ZVE32F-NEXT:  .LBB107_20: # %else38
-; RV64ZVE32F-NEXT:    slli a2, a1, 49
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    slli a2, a1, 49
 ; RV64ZVE32F-NEXT:    bgez a2, .LBB107_22
 ; RV64ZVE32F-NEXT:  # %bb.21: # %cond.load40
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -13741,15 +13741,14 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
 ; RV64V-NEXT:    vsext.vf8 v16, v8
 ; RV64V-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
 ; RV64V-NEXT:    vslidedown.vi v12, v10, 16
-; RV64V-NEXT:    vslidedown.vi v14, v8, 16
-; RV64V-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64V-NEXT:    vslidedown.vi v8, v0, 2
+; RV64V-NEXT:    vslidedown.vi v8, v8, 16
 ; RV64V-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64V-NEXT:    vluxei64.v v10, (a0), v16, v0.t
 ; RV64V-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64V-NEXT:    vsext.vf8 v16, v14
-; RV64V-NEXT:    vmv1r.v v0, v8
-; RV64V-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
+; RV64V-NEXT:    vsext.vf8 v16, v8
+; RV64V-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64V-NEXT:    vslidedown.vi v0, v0, 2
+; RV64V-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64V-NEXT:    vluxei64.v v12, (a0), v16, v0.t
 ; RV64V-NEXT:    li a0, 32
 ; RV64V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -13784,9 +13783,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
 ; RV64ZVE32F-NEXT:  .LBB108_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v13, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v12, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB108_49
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -13802,9 +13801,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
 ; RV64ZVE32F-NEXT:    vmv.s.x v12, a2
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 4
 ; RV64ZVE32F-NEXT:  .LBB108_8: # %else11
-; RV64ZVE32F-NEXT:    andi a2, a1, 32
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v12, v8, 8
+; RV64ZVE32F-NEXT:    andi a2, a1, 32
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB108_10
 ; RV64ZVE32F-NEXT:  # %bb.9: # %cond.load13
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
@@ -13816,9 +13815,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e8, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v14, 5
 ; RV64ZVE32F-NEXT:  .LBB108_10: # %else14
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v13, v13, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB108_51
 ; RV64ZVE32F-NEXT:  # %bb.11: # %else17
 ; RV64ZVE32F-NEXT:    andi a2, a1, 128
@@ -13841,9 +13840,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
 ; RV64ZVE32F-NEXT:  .LBB108_15: # %else26
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v13, v12, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 1024
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v12, v12, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 1024
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB108_17
 ; RV64ZVE32F-NEXT:  # %bb.16: # %cond.load28
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v12
@@ -13865,9 +13864,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
 ; RV64ZVE32F-NEXT:    vsetivli zero, 12, e8, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 11
 ; RV64ZVE32F-NEXT:  .LBB108_19: # %else32
-; RV64ZVE32F-NEXT:    slli a2, a1, 51
 ; RV64ZVE32F-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 16
+; RV64ZVE32F-NEXT:    slli a2, a1, 51
 ; RV64ZVE32F-NEXT:    bgez a2, .LBB108_21
 ; RV64ZVE32F-NEXT:  # %bb.20: # %cond.load34
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v13
@@ -13889,9 +13888,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
 ; RV64ZVE32F-NEXT:    vsetivli zero, 14, e8, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v9, 13
 ; RV64ZVE32F-NEXT:  .LBB108_23: # %else38
-; RV64ZVE32F-NEXT:    slli a2, a1, 49
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v13, 2
+; RV64ZVE32F-NEXT:    slli a2, a1, 49
 ; RV64ZVE32F-NEXT:    bltz a2, .LBB108_54
 ; RV64ZVE32F-NEXT:  # %bb.24: # %else41
 ; RV64ZVE32F-NEXT:    slli a2, a1, 48
@@ -13914,9 +13913,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
 ; RV64ZVE32F-NEXT:  .LBB108_28: # %else50
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    slli a2, a1, 45
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v12, v8, 2
+; RV64ZVE32F-NEXT:    slli a2, a1, 45
 ; RV64ZVE32F-NEXT:    bltz a2, .LBB108_57
 ; RV64ZVE32F-NEXT:  # %bb.29: # %else53
 ; RV64ZVE32F-NEXT:    slli a2, a1, 44
@@ -13932,9 +13931,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
 ; RV64ZVE32F-NEXT:    vmv.s.x v12, a2
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 20
 ; RV64ZVE32F-NEXT:  .LBB108_32: # %else59
-; RV64ZVE32F-NEXT:    slli a2, a1, 42
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 8
+; RV64ZVE32F-NEXT:    slli a2, a1, 42
 ; RV64ZVE32F-NEXT:    bgez a2, .LBB108_34
 ; RV64ZVE32F-NEXT:  # %bb.33: # %cond.load61
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
@@ -13946,9 +13945,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
 ; RV64ZVE32F-NEXT:    vsetivli zero, 22, e8, m2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 21
 ; RV64ZVE32F-NEXT:  .LBB108_34: # %else62
-; RV64ZVE32F-NEXT:    slli a2, a1, 41
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-NEXT:    slli a2, a1, 41
 ; RV64ZVE32F-NEXT:    bltz a2, .LBB108_59
 ; RV64ZVE32F-NEXT:  # %bb.35: # %else65
 ; RV64ZVE32F-NEXT:    slli a2, a1, 40
@@ -13971,9 +13970,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
 ; RV64ZVE32F-NEXT:  .LBB108_39: # %else74
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    slli a2, a1, 37
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    slli a2, a1, 37
 ; RV64ZVE32F-NEXT:    bltz a2, .LBB108_62
 ; RV64ZVE32F-NEXT:  # %bb.40: # %else77
 ; RV64ZVE32F-NEXT:    slli a2, a1, 36
@@ -13994,9 +13993,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
 ; RV64ZVE32F-NEXT:    vsetivli zero, 30, e8, m2, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v10, v12, 29
 ; RV64ZVE32F-NEXT:  .LBB108_44: # %else86
-; RV64ZVE32F-NEXT:    slli a2, a1, 33
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    slli a2, a1, 33
 ; RV64ZVE32F-NEXT:    bgez a2, .LBB108_46
 ; RV64ZVE32F-NEXT:  # %bb.45: # %cond.load88
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
@@ -14279,8 +14278,8 @@ define <4 x i32> @mgather_narrow_edge_case(ptr %base) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, -512
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v0, 5
 ; RV32-NEXT:    vmv.v.x v8, a1
+; RV32-NEXT:    vmv.v.i v0, 5
 ; RV32-NEXT:    vmerge.vim v8, v8, 0, v0
 ; RV32-NEXT:    vluxei32.v v8, (a0), v8
 ; RV32-NEXT:    ret
@@ -14288,10 +14287,11 @@ define <4 x i32> @mgather_narrow_edge_case(ptr %base) {
 ; RV64V-LABEL: mgather_narrow_edge_case:
 ; RV64V:       # %bb.0:
 ; RV64V-NEXT:    li a1, -512
+; RV64V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64V-NEXT:    vmv.v.x v8, a1
 ; RV64V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; RV64V-NEXT:    vmv.v.i v0, 5
 ; RV64V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64V-NEXT:    vmv.v.x v8, a1
 ; RV64V-NEXT:    vmerge.vim v10, v8, 0, v0
 ; RV64V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV64V-NEXT:    vluxei64.v v8, (a0), v10
@@ -14302,8 +14302,8 @@ define <4 x i32> @mgather_narrow_edge_case(ptr %base) {
 ; RV64ZVE32F-NEXT:    lw a1, -512(a0)
 ; RV64ZVE32F-NEXT:    lw a0, 0(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 5
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 5
 ; RV64ZVE32F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i32, ptr %base, <4 x i8>  <i8 0, i8 128, i8 0, i8 128>
@@ -14337,36 +14337,36 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) {
 ; RV32-NEXT:    lbu a0, 1(a0)
 ; RV32-NEXT:    vmv.x.s a7, v10
 ; RV32-NEXT:    vmv.x.s t0, v8
-; RV32-NEXT:    lbu t1, 0(a1)
-; RV32-NEXT:    lbu a1, 1(a1)
-; RV32-NEXT:    lbu t2, 0(a2)
-; RV32-NEXT:    lbu a2, 1(a2)
 ; RV32-NEXT:    slli a0, a0, 8
 ; RV32-NEXT:    or a0, a0, a6
-; RV32-NEXT:    lbu a6, 0(a3)
-; RV32-NEXT:    lbu a3, 1(a3)
+; RV32-NEXT:    lbu a6, 0(a1)
+; RV32-NEXT:    lbu a1, 1(a1)
 ; RV32-NEXT:    slli a1, a1, 8
-; RV32-NEXT:    or a1, a1, t1
-; RV32-NEXT:    lbu t1, 0(a4)
-; RV32-NEXT:    lbu a4, 1(a4)
+; RV32-NEXT:    or a1, a1, a6
+; RV32-NEXT:    lbu a6, 0(a2)
+; RV32-NEXT:    lbu a2, 1(a2)
 ; RV32-NEXT:    slli a2, a2, 8
-; RV32-NEXT:    or a2, a2, t2
-; RV32-NEXT:    lbu t2, 0(a5)
-; RV32-NEXT:    lbu a5, 1(a5)
+; RV32-NEXT:    or a2, a2, a6
+; RV32-NEXT:    lbu a6, 0(a3)
+; RV32-NEXT:    lbu a3, 1(a3)
 ; RV32-NEXT:    slli a3, a3, 8
 ; RV32-NEXT:    or a3, a3, a6
-; RV32-NEXT:    lbu a6, 0(a7)
-; RV32-NEXT:    lbu a7, 1(a7)
+; RV32-NEXT:    lbu a6, 0(a4)
+; RV32-NEXT:    lbu a4, 1(a4)
 ; RV32-NEXT:    slli a4, a4, 8
-; RV32-NEXT:    or a4, a4, t1
-; RV32-NEXT:    lbu t1, 0(t0)
-; RV32-NEXT:    lbu t0, 1(t0)
+; RV32-NEXT:    or a4, a4, a6
+; RV32-NEXT:    lbu a6, 0(a5)
+; RV32-NEXT:    lbu a5, 1(a5)
 ; RV32-NEXT:    slli a5, a5, 8
-; RV32-NEXT:    or a5, a5, t2
+; RV32-NEXT:    or a5, a5, a6
+; RV32-NEXT:    lbu a6, 0(a7)
+; RV32-NEXT:    lbu a7, 1(a7)
 ; RV32-NEXT:    slli a7, a7, 8
 ; RV32-NEXT:    or a6, a7, a6
+; RV32-NEXT:    lbu a7, 0(t0)
+; RV32-NEXT:    lbu t0, 1(t0)
 ; RV32-NEXT:    slli t0, t0, 8
-; RV32-NEXT:    or a7, t0, t1
+; RV32-NEXT:    or a7, t0, a7
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; RV32-NEXT:    vmv.v.x v8, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a1
@@ -14375,8 +14375,8 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) {
 ; RV32-NEXT:    vslide1down.vx v9, v9, a5
 ; RV32-NEXT:    vslide1down.vx v10, v8, a3
 ; RV32-NEXT:    vslide1down.vx v8, v9, a6
-; RV32-NEXT:    vmv.v.i v0, 15
 ; RV32-NEXT:    vslide1down.vx v8, v8, a7
+; RV32-NEXT:    vmv.v.i v0, 15
 ; RV32-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV32-NEXT:    ret
 ;
@@ -14450,8 +14450,8 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) {
 ; RV64V-NEXT:    vmv.v.x v8, a3
 ; RV64V-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64V-NEXT:    vslide1down.vx v8, v8, a1
-; RV64V-NEXT:    vmv.v.i v0, 15
 ; RV64V-NEXT:    vslide1down.vx v8, v8, a2
+; RV64V-NEXT:    vmv.v.i v0, 15
 ; RV64V-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; RV64V-NEXT:    addi sp, s0, -128
 ; RV64V-NEXT:    .cfi_def_cfa sp, 128
@@ -14475,38 +14475,38 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) {
 ; RV64ZVE32F-NEXT:    lbu t0, 13(a0)
 ; RV64ZVE32F-NEXT:    slli a2, a2, 8
 ; RV64ZVE32F-NEXT:    slli a4, a4, 8
+; RV64ZVE32F-NEXT:    slli a6, a6, 8
 ; RV64ZVE32F-NEXT:    or a1, a2, a1
 ; RV64ZVE32F-NEXT:    or a3, a4, a3
-; RV64ZVE32F-NEXT:    lbu a2, 16(a0)
-; RV64ZVE32F-NEXT:    lbu a4, 17(a0)
-; RV64ZVE32F-NEXT:    lbu t1, 20(a0)
-; RV64ZVE32F-NEXT:    lbu t2, 21(a0)
-; RV64ZVE32F-NEXT:    slli a6, a6, 8
-; RV64ZVE32F-NEXT:    or a5, a6, a5
+; RV64ZVE32F-NEXT:    or a2, a6, a5
+; RV64ZVE32F-NEXT:    lbu a4, 16(a0)
+; RV64ZVE32F-NEXT:    lbu a5, 17(a0)
+; RV64ZVE32F-NEXT:    lbu a6, 20(a0)
+; RV64ZVE32F-NEXT:    lbu t1, 21(a0)
 ; RV64ZVE32F-NEXT:    slli t0, t0, 8
-; RV64ZVE32F-NEXT:    slli a4, a4, 8
-; RV64ZVE32F-NEXT:    slli t2, t2, 8
-; RV64ZVE32F-NEXT:    or a6, t0, a7
-; RV64ZVE32F-NEXT:    or a2, a4, a2
-; RV64ZVE32F-NEXT:    lbu a4, 24(a0)
-; RV64ZVE32F-NEXT:    lbu a7, 25(a0)
-; RV64ZVE32F-NEXT:    or t0, t2, t1
+; RV64ZVE32F-NEXT:    slli a5, a5, 8
+; RV64ZVE32F-NEXT:    slli t1, t1, 8
+; RV64ZVE32F-NEXT:    or a7, t0, a7
+; RV64ZVE32F-NEXT:    or a4, a5, a4
+; RV64ZVE32F-NEXT:    or a5, t1, a6
+; RV64ZVE32F-NEXT:    lbu a6, 24(a0)
+; RV64ZVE32F-NEXT:    lbu t0, 25(a0)
 ; RV64ZVE32F-NEXT:    lbu t1, 28(a0)
 ; RV64ZVE32F-NEXT:    lbu a0, 29(a0)
-; RV64ZVE32F-NEXT:    slli a7, a7, 8
-; RV64ZVE32F-NEXT:    or a4, a7, a4
-; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
+; RV64ZVE32F-NEXT:    slli t0, t0, 8
+; RV64ZVE32F-NEXT:    or a6, t0, a6
 ; RV64ZVE32F-NEXT:    slli a0, a0, 8
 ; RV64ZVE32F-NEXT:    or a0, a0, t1
+; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
-; RV64ZVE32F-NEXT:    vmv.v.x v9, a2
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, t0
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a4
-; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a6
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a5
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a6
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a7
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -14541,7 +14541,6 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 24(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 26(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
 ; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
@@ -14550,6 +14549,7 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
@@ -14586,7 +14586,6 @@ define <8 x i16> @mgather_strided_2xSEW_with_offset(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 28(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 30(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
 ; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
@@ -14595,6 +14594,7 @@ define <8 x i16> @mgather_strided_2xSEW_with_offset(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 2, i64 3, i64 6, i64 7, i64 10, i64 11, i64 14, i64 15>
@@ -14631,7 +14631,6 @@ define <8 x i16> @mgather_reverse_unit_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 20(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 22(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a3
 ; RV64ZVE32F-NEXT:    vmv.v.x v9, a7
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
@@ -14640,6 +14639,7 @@ define <8 x i16> @mgather_reverse_unit_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a2
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a6
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 14, i64 15, i64 12, i64 13, i64 10, i64 11, i64 8, i64 9>
@@ -14676,7 +14676,6 @@ define <8 x i16> @mgather_reverse_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 12(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 14(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a3
 ; RV64ZVE32F-NEXT:    vmv.v.x v9, a7
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
@@ -14685,6 +14684,7 @@ define <8 x i16> @mgather_reverse_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a2
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a6
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 14, i64 15, i64 10, i64 11, i64 6, i64 7, i64 2, i64 3>
@@ -14720,7 +14720,6 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 4(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 6(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
 ; RV64ZVE32F-NEXT:    vmv.v.x v9, a1
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
@@ -14729,6 +14728,7 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 2, i32 3>
@@ -14767,7 +14767,6 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 4(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 6(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
 ; RV64ZVE32F-NEXT:    vmv.v.x v9, a1
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
@@ -14776,6 +14775,7 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 9, i32 10, i32 4, i32 5, i32 2, i32 3>
@@ -14806,23 +14806,23 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned2(ptr %base) {
 ;
 ; RV64ZVE32F-LABEL: mgather_gather_2xSEW_unaligned2:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    lh a1, 2(a0)
-; RV64ZVE32F-NEXT:    lh a2, 4(a0)
-; RV64ZVE32F-NEXT:    lh a3, 6(a0)
-; RV64ZVE32F-NEXT:    lh a4, 8(a0)
-; RV64ZVE32F-NEXT:    lh a5, 10(a0)
-; RV64ZVE32F-NEXT:    lh a6, 18(a0)
-; RV64ZVE32F-NEXT:    lh a0, 20(a0)
+; RV64ZVE32F-NEXT:    lh a1, 10(a0)
+; RV64ZVE32F-NEXT:    lh a2, 18(a0)
+; RV64ZVE32F-NEXT:    lh a3, 20(a0)
+; RV64ZVE32F-NEXT:    lh a4, 2(a0)
+; RV64ZVE32F-NEXT:    lh a5, 4(a0)
+; RV64ZVE32F-NEXT:    lh a6, 6(a0)
+; RV64ZVE32F-NEXT:    lh a0, 8(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
-; RV64ZVE32F-NEXT:    vmv.v.x v9, a4
+; RV64ZVE32F-NEXT:    vmv.v.x v8, a4
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a0
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a5
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a2
-; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a0
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a3
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a3
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a6
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 1, i32 2, i32 9, i32 10, i32 4, i32 5, i32 2, i32 3>
@@ -14865,7 +14865,6 @@ define <8 x i16> @mgather_gather_4xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 20(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 22(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
 ; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
@@ -14874,6 +14873,7 @@ define <8 x i16> @mgather_gather_4xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
@@ -14913,7 +14913,6 @@ define <8 x i16> @mgather_gather_4xSEW_partial_align(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 20(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 22(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
 ; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
@@ -14922,6 +14921,7 @@ define <8 x i16> @mgather_gather_4xSEW_partial_align(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
@@ -14970,7 +14970,6 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 4(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 6(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
 ; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
@@ -14979,6 +14978,7 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 4, i64 5, i64 6, i64 7, i64 0, i64 1, i64 2, i64 3>
@@ -15018,7 +15018,6 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 12(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 14(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
 ; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
@@ -15027,6 +15026,7 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) {
 ; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
 ; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a2
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 0, i64 2, i64 3, i64 1, i64 4, i64 5, i64 6, i64 7>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
index f72b08a405246..f27c8e5d664e3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
@@ -242,9 +242,9 @@ define <32 x double> @masked_load_v32f64(ptr %a, <32 x i1> %mask) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0), v0.t
+; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
@@ -278,12 +278,12 @@ define <64 x float> @masked_load_v64f32(ptr %a, <64 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v64f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v0, 4
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v0, 4
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   %load = call <64 x float> @llvm.masked.load.v64f32(ptr %a, i32 8, <64 x i1> %mask, <64 x float> undef)
@@ -294,12 +294,12 @@ define <128 x bfloat> @masked_load_v128bf16(ptr %a, <128 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v128bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v0, 8
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vle16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   %load = call <128 x bfloat> @llvm.masked.load.v128bf16(ptr %a, i32 8, <128 x i1> %mask, <128 x bfloat> undef)
@@ -310,12 +310,12 @@ define <128 x half> @masked_load_v128f16(ptr %a, <128 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v128f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v0, 8
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vle16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   %load = call <128 x half> @llvm.masked.load.v128f16(ptr %a, i32 8, <128 x i1> %mask, <128 x half> undef)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
index 69903d77084bf..6e613917f8cd9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
@@ -240,9 +240,9 @@ define <32 x i64> @masked_load_v32i64(ptr %a, <32 x i1> %mask) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0), v0.t
+; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
@@ -276,12 +276,12 @@ define <64 x i32> @masked_load_v64i32(ptr %a, <64 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v64i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v0, 4
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v0, 4
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   %load = call <64 x i32> @llvm.masked.load.v64i32(ptr %a, i32 8, <64 x i1> %mask, <64 x i32> undef)
@@ -303,12 +303,12 @@ define <128 x i16> @masked_load_v128i16(ptr %a, <128 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v128i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v0, 8
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vle16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   %load = call <128 x i16> @llvm.masked.load.v128i16(ptr %a, i32 8, <128 x i1> %mask, <128 x i16> undef)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index 7354f9afa9a71..7358fd4cfa0f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -123,9 +123,9 @@ define void @mscatter_v2i16_truncstore_v2i8(<2 x i16> %val, <2 x ptr> %ptrs, <2
 ; RV64ZVE32F-LABEL: mscatter_v2i16_truncstore_v2i8:
 ; RV64ZVE32F:       # %bb.0:
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT:    vnsrl.wi v8, v8, 0
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v0
 ; RV64ZVE32F-NEXT:    andi a3, a2, 1
-; RV64ZVE32F-NEXT:    vnsrl.wi v8, v8, 0
 ; RV64ZVE32F-NEXT:    bnez a3, .LBB2_3
 ; RV64ZVE32F-NEXT:  # %bb.1: # %else
 ; RV64ZVE32F-NEXT:    andi a2, a2, 2
@@ -181,8 +181,8 @@ define void @mscatter_v2i32_truncstore_v2i8(<2 x i32> %val, <2 x ptr> %ptrs, <2
 ; RV64ZVE32F-NEXT:    vnsrl.wi v8, v8, 0
 ; RV64ZVE32F-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV64ZVE32F-NEXT:    andi a3, a2, 1
 ; RV64ZVE32F-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVE32F-NEXT:    andi a3, a2, 1
 ; RV64ZVE32F-NEXT:    bnez a3, .LBB3_3
 ; RV64ZVE32F-NEXT:  # %bb.1: # %else
 ; RV64ZVE32F-NEXT:    andi a2, a2, 2
@@ -229,11 +229,11 @@ define void @mscatter_v2i64_truncstore_v2i8(<2 x i64> %val, <2 x ptr> %ptrs, <2
 ;
 ; RV32ZVE32F-LABEL: mscatter_v2i64_truncstore_v2i8:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a1, 8(a0)
-; RV32ZVE32F-NEXT:    lw a0, 0(a0)
+; RV32ZVE32F-NEXT:    lw a1, 0(a0)
+; RV32ZVE32F-NEXT:    lw a0, 8(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32ZVE32F-NEXT:    vmv.s.x v9, a1
-; RV32ZVE32F-NEXT:    vmv.s.x v10, a0
+; RV32ZVE32F-NEXT:    vmv.s.x v9, a0
+; RV32ZVE32F-NEXT:    vmv.s.x v10, a1
 ; RV32ZVE32F-NEXT:    vslideup.vi v10, v9, 1
 ; RV32ZVE32F-NEXT:    vsoxei32.v v10, (zero), v8, v0.t
 ; RV32ZVE32F-NEXT:    ret
@@ -244,8 +244,8 @@ define void @mscatter_v2i64_truncstore_v2i8(<2 x i64> %val, <2 x ptr> %ptrs, <2
 ; RV64ZVE32F-NEXT:    vmv.s.x v9, a1
 ; RV64ZVE32F-NEXT:    vmv.s.x v8, a0
 ; RV64ZVE32F-NEXT:    vmv.x.s a0, v0
-; RV64ZVE32F-NEXT:    andi a1, a0, 1
 ; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 1
+; RV64ZVE32F-NEXT:    andi a1, a0, 1
 ; RV64ZVE32F-NEXT:    bnez a1, .LBB4_3
 ; RV64ZVE32F-NEXT:  # %bb.1: # %else
 ; RV64ZVE32F-NEXT:    andi a0, a0, 2
@@ -513,9 +513,9 @@ define void @mscatter_baseidx_v8i8(<8 x i8> %val, ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-NEXT:  .LBB9_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB9_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -535,9 +535,9 @@ define void @mscatter_baseidx_v8i8(<8 x i8> %val, ptr %base, <8 x i8> %idxs, <8
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:    vse8.v v9, (a2)
 ; RV64ZVE32F-NEXT:  .LBB9_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB9_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -698,11 +698,11 @@ define void @mscatter_v2i32_truncstore_v2i16(<2 x i32> %val, <2 x ptr> %ptrs, <2
 ;
 ; RV64ZVE32F-LABEL: mscatter_v2i32_truncstore_v2i16:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV64ZVE32F-NEXT:    andi a3, a2, 1
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vnsrl.wi v8, v8, 0
+; RV64ZVE32F-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT:    vmv.x.s a2, v0
+; RV64ZVE32F-NEXT:    andi a3, a2, 1
 ; RV64ZVE32F-NEXT:    bnez a3, .LBB12_3
 ; RV64ZVE32F-NEXT:  # %bb.1: # %else
 ; RV64ZVE32F-NEXT:    andi a2, a2, 2
@@ -745,11 +745,11 @@ define void @mscatter_v2i64_truncstore_v2i16(<2 x i64> %val, <2 x ptr> %ptrs, <2
 ;
 ; RV32ZVE32F-LABEL: mscatter_v2i64_truncstore_v2i16:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a1, 8(a0)
-; RV32ZVE32F-NEXT:    lw a0, 0(a0)
+; RV32ZVE32F-NEXT:    lw a1, 0(a0)
+; RV32ZVE32F-NEXT:    lw a0, 8(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.s.x v9, a1
-; RV32ZVE32F-NEXT:    vmv.s.x v10, a0
+; RV32ZVE32F-NEXT:    vmv.s.x v9, a0
+; RV32ZVE32F-NEXT:    vmv.s.x v10, a1
 ; RV32ZVE32F-NEXT:    vslideup.vi v10, v9, 1
 ; RV32ZVE32F-NEXT:    vsoxei32.v v10, (zero), v8, v0.t
 ; RV32ZVE32F-NEXT:    ret
@@ -761,9 +761,9 @@ define void @mscatter_v2i64_truncstore_v2i16(<2 x i64> %val, <2 x ptr> %ptrs, <2
 ; RV64ZVE32F-NEXT:    vmv.s.x v8, a0
 ; RV64ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a0, v0
-; RV64ZVE32F-NEXT:    andi a1, a0, 1
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 1
+; RV64ZVE32F-NEXT:    andi a1, a0, 1
 ; RV64ZVE32F-NEXT:    bnez a1, .LBB13_3
 ; RV64ZVE32F-NEXT:  # %bb.1: # %else
 ; RV64ZVE32F-NEXT:    andi a0, a0, 2
@@ -1035,9 +1035,9 @@ define void @mscatter_baseidx_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8> %id
 ; RV64ZVE32F-NEXT:  .LBB18_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB18_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -1058,9 +1058,9 @@ define void @mscatter_baseidx_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8> %id
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:    vse16.v v9, (a2)
 ; RV64ZVE32F-NEXT:  .LBB18_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB18_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -1168,9 +1168,9 @@ define void @mscatter_baseidx_sext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:  .LBB19_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB19_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -1191,9 +1191,9 @@ define void @mscatter_baseidx_sext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:    vse16.v v9, (a2)
 ; RV64ZVE32F-NEXT:  .LBB19_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB19_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -1302,9 +1302,9 @@ define void @mscatter_baseidx_zext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:  .LBB20_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB20_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -1326,9 +1326,9 @@ define void @mscatter_baseidx_zext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:    vse16.v v9, (a2)
 ; RV64ZVE32F-NEXT:  .LBB20_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB20_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -1440,9 +1440,9 @@ define void @mscatter_baseidx_v8i16(<8 x i16> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-NEXT:  .LBB21_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB21_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -1463,9 +1463,9 @@ define void @mscatter_baseidx_v8i16(<8 x i16> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:    vse16.v v9, (a2)
 ; RV64ZVE32F-NEXT:  .LBB21_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB21_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -1638,10 +1638,10 @@ define void @mscatter_v2i64_truncstore_v2i32(<2 x i64> %val, <2 x ptr> %ptrs, <2
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a0
 ; RV64ZVE32F-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a0, v0
-; RV64ZVE32F-NEXT:    andi a4, a0, 1
 ; RV64ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
-; RV64ZVE32F-NEXT:    bnez a4, .LBB24_3
+; RV64ZVE32F-NEXT:    andi a1, a0, 1
+; RV64ZVE32F-NEXT:    bnez a1, .LBB24_3
 ; RV64ZVE32F-NEXT:  # %bb.1: # %else
 ; RV64ZVE32F-NEXT:    andi a0, a0, 2
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB24_4
@@ -1915,9 +1915,9 @@ define void @mscatter_baseidx_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8> %id
 ; RV64ZVE32F-NEXT:  .LBB29_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v10, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB29_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -1939,9 +1939,9 @@ define void @mscatter_baseidx_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8> %id
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse32.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB29_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB29_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -2052,9 +2052,9 @@ define void @mscatter_baseidx_sext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:  .LBB30_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v10, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB30_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -2076,9 +2076,9 @@ define void @mscatter_baseidx_sext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse32.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB30_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB30_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -2193,9 +2193,9 @@ define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:  .LBB31_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v10, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB31_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -2218,9 +2218,9 @@ define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse32.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB31_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB31_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -2339,9 +2339,9 @@ define void @mscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %
 ; RV64ZVE32F-NEXT:  .LBB32_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v10, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB32_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -2363,9 +2363,9 @@ define void @mscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse32.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB32_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB32_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -2478,9 +2478,9 @@ define void @mscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:  .LBB33_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v10, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB33_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -2502,9 +2502,9 @@ define void @mscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse32.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB33_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB33_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -2620,9 +2620,9 @@ define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:  .LBB34_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v10, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB34_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -2645,9 +2645,9 @@ define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse32.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB34_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB34_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -2761,9 +2761,9 @@ define void @mscatter_baseidx_v8i32(<8 x i32> %val, ptr %base, <8 x i32> %idxs,
 ; RV64ZVE32F-NEXT:  .LBB35_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v12, v10, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB35_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -2785,9 +2785,9 @@ define void @mscatter_baseidx_v8i32(<8 x i32> %val, ptr %base, <8 x i32> %idxs,
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse32.v v10, (a2)
 ; RV64ZVE32F-NEXT:  .LBB35_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v12, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB35_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -3425,13 +3425,13 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id
 ; RV32ZVE32F-NEXT:    lw t6, 20(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi s2, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    bnez s2, .LBB42_10
+; RV32ZVE32F-NEXT:    andi a1, t0, 1
+; RV32ZVE32F-NEXT:    bnez a1, .LBB42_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a0, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB42_11
@@ -3560,9 +3560,9 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id
 ; RV64ZVE32F-NEXT:  .LBB42_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a0, a5, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 4
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB42_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a0, a5, 8
@@ -3580,8 +3580,8 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
 ; RV64ZVE32F-NEXT:    sd a4, 0(a0)
 ; RV64ZVE32F-NEXT:  .LBB42_9: # %else10
-; RV64ZVE32F-NEXT:    andi a0, a5, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 64
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB42_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a0, a5, -128
@@ -3675,13 +3675,13 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV32ZVE32F-NEXT:    lw t6, 20(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi s2, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    bnez s2, .LBB43_10
+; RV32ZVE32F-NEXT:    andi a1, t0, 1
+; RV32ZVE32F-NEXT:    bnez a1, .LBB43_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a0, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB43_11
@@ -3810,9 +3810,9 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:  .LBB43_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a0, a5, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 4
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB43_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a0, a5, 8
@@ -3830,8 +3830,8 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
 ; RV64ZVE32F-NEXT:    sd a4, 0(a0)
 ; RV64ZVE32F-NEXT:  .LBB43_9: # %else10
-; RV64ZVE32F-NEXT:    andi a0, a5, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 64
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB43_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a0, a5, -128
@@ -3927,13 +3927,13 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV32ZVE32F-NEXT:    lw t6, 20(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi s2, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    bnez s2, .LBB44_10
+; RV32ZVE32F-NEXT:    andi a1, t0, 1
+; RV32ZVE32F-NEXT:    bnez a1, .LBB44_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a0, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB44_11
@@ -4032,7 +4032,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a4, 40(a0)
 ; RV64ZVE32F-NEXT:    ld a3, 48(a0)
 ; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
@@ -4040,8 +4040,8 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:    ld a7, 24(a0)
 ; RV64ZVE32F-NEXT:    ld a6, 32(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
-; RV64ZVE32F-NEXT:    andi t2, a4, 1
+; RV64ZVE32F-NEXT:    vmv.x.s a5, v0
+; RV64ZVE32F-NEXT:    andi t2, a5, 1
 ; RV64ZVE32F-NEXT:    beqz t2, .LBB44_2
 ; RV64ZVE32F-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-NEXT:    ld a0, 0(a0)
@@ -4051,7 +4051,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:    add t2, a1, t2
 ; RV64ZVE32F-NEXT:    sd a0, 0(t2)
 ; RV64ZVE32F-NEXT:  .LBB44_2: # %else
-; RV64ZVE32F-NEXT:    andi a0, a4, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 2
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB44_4
 ; RV64ZVE32F-NEXT:  # %bb.3: # %cond.store1
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
@@ -4064,18 +4064,18 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:  .LBB44_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a0, a4, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 4
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB44_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
-; RV64ZVE32F-NEXT:    andi a0, a4, 8
+; RV64ZVE32F-NEXT:    andi a0, a5, 8
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB44_13
 ; RV64ZVE32F-NEXT:  .LBB44_6: # %else6
-; RV64ZVE32F-NEXT:    andi a0, a4, 16
+; RV64ZVE32F-NEXT:    andi a0, a5, 16
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB44_14
 ; RV64ZVE32F-NEXT:  .LBB44_7: # %else8
-; RV64ZVE32F-NEXT:    andi a0, a4, 32
+; RV64ZVE32F-NEXT:    andi a0, a5, 32
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB44_9
 ; RV64ZVE32F-NEXT:  .LBB44_8: # %cond.store9
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
@@ -4083,13 +4083,13 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:    andi a0, a0, 255
 ; RV64ZVE32F-NEXT:    slli a0, a0, 3
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
-; RV64ZVE32F-NEXT:    sd a5, 0(a0)
+; RV64ZVE32F-NEXT:    sd a4, 0(a0)
 ; RV64ZVE32F-NEXT:  .LBB44_9: # %else10
-; RV64ZVE32F-NEXT:    andi a0, a4, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 64
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB44_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
-; RV64ZVE32F-NEXT:    andi a0, a4, -128
+; RV64ZVE32F-NEXT:    andi a0, a5, -128
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB44_16
 ; RV64ZVE32F-NEXT:  .LBB44_11: # %else14
 ; RV64ZVE32F-NEXT:    ret
@@ -4099,7 +4099,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:    slli a0, a0, 3
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
 ; RV64ZVE32F-NEXT:    sd t0, 0(a0)
-; RV64ZVE32F-NEXT:    andi a0, a4, 8
+; RV64ZVE32F-NEXT:    andi a0, a5, 8
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB44_6
 ; RV64ZVE32F-NEXT:  .LBB44_13: # %cond.store5
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 1
@@ -4108,7 +4108,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:    slli a0, a0, 3
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
 ; RV64ZVE32F-NEXT:    sd a7, 0(a0)
-; RV64ZVE32F-NEXT:    andi a0, a4, 16
+; RV64ZVE32F-NEXT:    andi a0, a5, 16
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB44_7
 ; RV64ZVE32F-NEXT:  .LBB44_14: # %cond.store7
 ; RV64ZVE32F-NEXT:    vmv.x.s a0, v9
@@ -4116,7 +4116,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:    slli a0, a0, 3
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
 ; RV64ZVE32F-NEXT:    sd a6, 0(a0)
-; RV64ZVE32F-NEXT:    andi a0, a4, 32
+; RV64ZVE32F-NEXT:    andi a0, a5, 32
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB44_8
 ; RV64ZVE32F-NEXT:    j .LBB44_9
 ; RV64ZVE32F-NEXT:  .LBB44_15: # %cond.store11
@@ -4125,7 +4125,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV64ZVE32F-NEXT:    slli a0, a0, 3
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
 ; RV64ZVE32F-NEXT:    sd a3, 0(a0)
-; RV64ZVE32F-NEXT:    andi a0, a4, -128
+; RV64ZVE32F-NEXT:    andi a0, a5, -128
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB44_11
 ; RV64ZVE32F-NEXT:  .LBB44_16: # %cond.store13
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 1
@@ -4186,13 +4186,13 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %
 ; RV32ZVE32F-NEXT:    lw t5, 20(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.v.x v10, a1
-; RV32ZVE32F-NEXT:    li s1, 8
+; RV32ZVE32F-NEXT:    li a1, 8
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVE32F-NEXT:    vwmaccus.vx v10, a1, v8
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi s2, a1, 1
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32ZVE32F-NEXT:    vwmaccus.vx v10, s1, v8
-; RV32ZVE32F-NEXT:    bnez s2, .LBB45_10
+; RV32ZVE32F-NEXT:    andi s1, a1, 1
+; RV32ZVE32F-NEXT:    bnez s1, .LBB45_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a0, a1, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB45_11
@@ -4323,9 +4323,9 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %
 ; RV64ZVE32F-NEXT:  .LBB45_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a0, a5, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 4
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB45_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a0, a5, 8
@@ -4343,8 +4343,8 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
 ; RV64ZVE32F-NEXT:    sd a4, 0(a0)
 ; RV64ZVE32F-NEXT:  .LBB45_9: # %else10
-; RV64ZVE32F-NEXT:    andi a0, a5, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 64
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB45_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a0, a5, -128
@@ -4438,13 +4438,13 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    lw t5, 20(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.v.x v10, a1
-; RV32ZVE32F-NEXT:    li s1, 8
+; RV32ZVE32F-NEXT:    li a1, 8
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVE32F-NEXT:    vwmaccus.vx v10, a1, v8
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi s2, a1, 1
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32ZVE32F-NEXT:    vwmaccus.vx v10, s1, v8
-; RV32ZVE32F-NEXT:    bnez s2, .LBB46_10
+; RV32ZVE32F-NEXT:    andi s1, a1, 1
+; RV32ZVE32F-NEXT:    bnez s1, .LBB46_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a0, a1, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB46_11
@@ -4575,9 +4575,9 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:  .LBB46_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a0, a5, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 4
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB46_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a0, a5, 8
@@ -4595,8 +4595,8 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
 ; RV64ZVE32F-NEXT:    sd a4, 0(a0)
 ; RV64ZVE32F-NEXT:  .LBB46_9: # %else10
-; RV64ZVE32F-NEXT:    andi a0, a5, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 64
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB46_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a0, a5, -128
@@ -4692,13 +4692,13 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    lw t5, 20(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.v.x v10, a1
-; RV32ZVE32F-NEXT:    li s1, 8
+; RV32ZVE32F-NEXT:    li a1, 8
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVE32F-NEXT:    vwmaccu.vx v10, a1, v8
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi s2, a1, 1
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; RV32ZVE32F-NEXT:    vwmaccu.vx v10, s1, v8
-; RV32ZVE32F-NEXT:    bnez s2, .LBB47_10
+; RV32ZVE32F-NEXT:    andi s1, a1, 1
+; RV32ZVE32F-NEXT:    bnez s1, .LBB47_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a0, a1, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB47_11
@@ -4798,7 +4798,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a4, 40(a0)
 ; RV64ZVE32F-NEXT:    ld a3, 48(a0)
 ; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
@@ -4806,8 +4806,8 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:    ld a7, 24(a0)
 ; RV64ZVE32F-NEXT:    ld a6, 32(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
-; RV64ZVE32F-NEXT:    andi t2, a4, 1
+; RV64ZVE32F-NEXT:    vmv.x.s a5, v0
+; RV64ZVE32F-NEXT:    andi t2, a5, 1
 ; RV64ZVE32F-NEXT:    beqz t2, .LBB47_2
 ; RV64ZVE32F-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-NEXT:    ld a0, 0(a0)
@@ -4818,7 +4818,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:    add t2, a1, t2
 ; RV64ZVE32F-NEXT:    sd a0, 0(t2)
 ; RV64ZVE32F-NEXT:  .LBB47_2: # %else
-; RV64ZVE32F-NEXT:    andi a0, a4, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 2
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB47_4
 ; RV64ZVE32F-NEXT:  # %bb.3: # %cond.store1
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
@@ -4831,18 +4831,18 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:  .LBB47_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a0, a4, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 4
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB47_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
-; RV64ZVE32F-NEXT:    andi a0, a4, 8
+; RV64ZVE32F-NEXT:    andi a0, a5, 8
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB47_13
 ; RV64ZVE32F-NEXT:  .LBB47_6: # %else6
-; RV64ZVE32F-NEXT:    andi a0, a4, 16
+; RV64ZVE32F-NEXT:    andi a0, a5, 16
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB47_14
 ; RV64ZVE32F-NEXT:  .LBB47_7: # %else8
-; RV64ZVE32F-NEXT:    andi a0, a4, 32
+; RV64ZVE32F-NEXT:    andi a0, a5, 32
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB47_9
 ; RV64ZVE32F-NEXT:  .LBB47_8: # %cond.store9
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
@@ -4850,13 +4850,13 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:    slli a0, a0, 48
 ; RV64ZVE32F-NEXT:    srli a0, a0, 45
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
-; RV64ZVE32F-NEXT:    sd a5, 0(a0)
+; RV64ZVE32F-NEXT:    sd a4, 0(a0)
 ; RV64ZVE32F-NEXT:  .LBB47_9: # %else10
-; RV64ZVE32F-NEXT:    andi a0, a4, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 64
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB47_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
-; RV64ZVE32F-NEXT:    andi a0, a4, -128
+; RV64ZVE32F-NEXT:    andi a0, a5, -128
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB47_16
 ; RV64ZVE32F-NEXT:  .LBB47_11: # %else14
 ; RV64ZVE32F-NEXT:    ret
@@ -4866,7 +4866,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:    srli a0, a0, 45
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
 ; RV64ZVE32F-NEXT:    sd t0, 0(a0)
-; RV64ZVE32F-NEXT:    andi a0, a4, 8
+; RV64ZVE32F-NEXT:    andi a0, a5, 8
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB47_6
 ; RV64ZVE32F-NEXT:  .LBB47_13: # %cond.store5
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 1
@@ -4875,7 +4875,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:    srli a0, a0, 45
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
 ; RV64ZVE32F-NEXT:    sd a7, 0(a0)
-; RV64ZVE32F-NEXT:    andi a0, a4, 16
+; RV64ZVE32F-NEXT:    andi a0, a5, 16
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB47_7
 ; RV64ZVE32F-NEXT:  .LBB47_14: # %cond.store7
 ; RV64ZVE32F-NEXT:    vmv.x.s a0, v9
@@ -4883,7 +4883,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:    srli a0, a0, 45
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
 ; RV64ZVE32F-NEXT:    sd a6, 0(a0)
-; RV64ZVE32F-NEXT:    andi a0, a4, 32
+; RV64ZVE32F-NEXT:    andi a0, a5, 32
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB47_8
 ; RV64ZVE32F-NEXT:    j .LBB47_9
 ; RV64ZVE32F-NEXT:  .LBB47_15: # %cond.store11
@@ -4892,7 +4892,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:    srli a0, a0, 45
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
 ; RV64ZVE32F-NEXT:    sd a3, 0(a0)
-; RV64ZVE32F-NEXT:    andi a0, a4, -128
+; RV64ZVE32F-NEXT:    andi a0, a5, -128
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB47_11
 ; RV64ZVE32F-NEXT:  .LBB47_16: # %cond.store13
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 1
@@ -4943,42 +4943,41 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> %
 ; RV32ZVE32F-NEXT:    lw a7, 44(a0)
 ; RV32ZVE32F-NEXT:    lw a4, 48(a0)
 ; RV32ZVE32F-NEXT:    lw a5, 52(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 28(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 36(a0)
-; RV32ZVE32F-NEXT:    lw s0, 8(a0)
-; RV32ZVE32F-NEXT:    lw s1, 12(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw t6, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi s2, t0, 1
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    bnez s2, .LBB48_10
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
+; RV32ZVE32F-NEXT:    andi s1, a1, 1
+; RV32ZVE32F-NEXT:    bnez s1, .LBB48_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
-; RV32ZVE32F-NEXT:    andi a0, t0, 2
+; RV32ZVE32F-NEXT:    andi a0, a1, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB48_11
 ; RV32ZVE32F-NEXT:  .LBB48_2: # %else2
-; RV32ZVE32F-NEXT:    andi a0, t0, 4
+; RV32ZVE32F-NEXT:    andi a0, a1, 4
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB48_12
 ; RV32ZVE32F-NEXT:  .LBB48_3: # %else4
-; RV32ZVE32F-NEXT:    andi a0, t0, 8
+; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB48_13
 ; RV32ZVE32F-NEXT:  .LBB48_4: # %else6
-; RV32ZVE32F-NEXT:    andi a0, t0, 16
+; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB48_14
 ; RV32ZVE32F-NEXT:  .LBB48_5: # %else8
-; RV32ZVE32F-NEXT:    andi a0, t0, 32
+; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB48_15
 ; RV32ZVE32F-NEXT:  .LBB48_6: # %else10
-; RV32ZVE32F-NEXT:    andi a0, t0, 64
+; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB48_16
 ; RV32ZVE32F-NEXT:  .LBB48_7: # %else12
-; RV32ZVE32F-NEXT:    andi a0, t0, -128
+; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB48_9
 ; RV32ZVE32F-NEXT:  .LBB48_8: # %cond.store13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
@@ -4998,44 +4997,45 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> %
 ; RV32ZVE32F-NEXT:    ret
 ; RV32ZVE32F-NEXT:  .LBB48_10: # %cond.store
 ; RV32ZVE32F-NEXT:    .cfi_restore_state
-; RV32ZVE32F-NEXT:    lw a1, 0(a0)
+; RV32ZVE32F-NEXT:    lw s1, 0(a0)
 ; RV32ZVE32F-NEXT:    lw a0, 4(a0)
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s s2, v8
-; RV32ZVE32F-NEXT:    sw a1, 0(s2)
+; RV32ZVE32F-NEXT:    sw s1, 0(s2)
 ; RV32ZVE32F-NEXT:    sw a0, 4(s2)
-; RV32ZVE32F-NEXT:    andi a0, t0, 2
+; RV32ZVE32F-NEXT:    andi a0, a1, 2
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB48_2
 ; RV32ZVE32F-NEXT:  .LBB48_11: # %cond.store1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw s0, 0(a0)
-; RV32ZVE32F-NEXT:    sw s1, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, 4
+; RV32ZVE32F-NEXT:    sw t6, 0(a0)
+; RV32ZVE32F-NEXT:    sw s0, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a1, 4
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB48_3
 ; RV32ZVE32F-NEXT:  .LBB48_12: # %cond.store3
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t6, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, 8
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB48_4
 ; RV32ZVE32F-NEXT:  .LBB48_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, 16
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB48_5
 ; RV32ZVE32F-NEXT:  .LBB48_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, 32
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB48_6
 ; RV32ZVE32F-NEXT:  .LBB48_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
@@ -5043,7 +5043,7 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> %
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
 ; RV32ZVE32F-NEXT:    sw a6, 0(a0)
 ; RV32ZVE32F-NEXT:    sw a7, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, 64
+; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB48_7
 ; RV32ZVE32F-NEXT:  .LBB48_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
@@ -5051,7 +5051,7 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> %
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
 ; RV32ZVE32F-NEXT:    sw a4, 0(a0)
 ; RV32ZVE32F-NEXT:    sw a5, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, -128
+; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB48_8
 ; RV32ZVE32F-NEXT:    j .LBB48_9
 ;
@@ -5088,9 +5088,9 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> %
 ; RV64ZVE32F-NEXT:  .LBB48_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a0, a5, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 4
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB48_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a0, a5, 8
@@ -5108,8 +5108,8 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> %
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
 ; RV64ZVE32F-NEXT:    sd a4, 0(a0)
 ; RV64ZVE32F-NEXT:  .LBB48_9: # %else10
-; RV64ZVE32F-NEXT:    andi a0, a5, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 64
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB48_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a0, a5, -128
@@ -5193,42 +5193,41 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    lw a7, 44(a0)
 ; RV32ZVE32F-NEXT:    lw a4, 48(a0)
 ; RV32ZVE32F-NEXT:    lw a5, 52(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 28(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 36(a0)
-; RV32ZVE32F-NEXT:    lw s0, 8(a0)
-; RV32ZVE32F-NEXT:    lw s1, 12(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw t6, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi s2, t0, 1
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    bnez s2, .LBB49_10
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
+; RV32ZVE32F-NEXT:    andi s1, a1, 1
+; RV32ZVE32F-NEXT:    bnez s1, .LBB49_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
-; RV32ZVE32F-NEXT:    andi a0, t0, 2
+; RV32ZVE32F-NEXT:    andi a0, a1, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB49_11
 ; RV32ZVE32F-NEXT:  .LBB49_2: # %else2
-; RV32ZVE32F-NEXT:    andi a0, t0, 4
+; RV32ZVE32F-NEXT:    andi a0, a1, 4
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB49_12
 ; RV32ZVE32F-NEXT:  .LBB49_3: # %else4
-; RV32ZVE32F-NEXT:    andi a0, t0, 8
+; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB49_13
 ; RV32ZVE32F-NEXT:  .LBB49_4: # %else6
-; RV32ZVE32F-NEXT:    andi a0, t0, 16
+; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB49_14
 ; RV32ZVE32F-NEXT:  .LBB49_5: # %else8
-; RV32ZVE32F-NEXT:    andi a0, t0, 32
+; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB49_15
 ; RV32ZVE32F-NEXT:  .LBB49_6: # %else10
-; RV32ZVE32F-NEXT:    andi a0, t0, 64
+; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB49_16
 ; RV32ZVE32F-NEXT:  .LBB49_7: # %else12
-; RV32ZVE32F-NEXT:    andi a0, t0, -128
+; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB49_9
 ; RV32ZVE32F-NEXT:  .LBB49_8: # %cond.store13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
@@ -5248,44 +5247,45 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    ret
 ; RV32ZVE32F-NEXT:  .LBB49_10: # %cond.store
 ; RV32ZVE32F-NEXT:    .cfi_restore_state
-; RV32ZVE32F-NEXT:    lw a1, 0(a0)
+; RV32ZVE32F-NEXT:    lw s1, 0(a0)
 ; RV32ZVE32F-NEXT:    lw a0, 4(a0)
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s s2, v8
-; RV32ZVE32F-NEXT:    sw a1, 0(s2)
+; RV32ZVE32F-NEXT:    sw s1, 0(s2)
 ; RV32ZVE32F-NEXT:    sw a0, 4(s2)
-; RV32ZVE32F-NEXT:    andi a0, t0, 2
+; RV32ZVE32F-NEXT:    andi a0, a1, 2
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB49_2
 ; RV32ZVE32F-NEXT:  .LBB49_11: # %cond.store1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw s0, 0(a0)
-; RV32ZVE32F-NEXT:    sw s1, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, 4
+; RV32ZVE32F-NEXT:    sw t6, 0(a0)
+; RV32ZVE32F-NEXT:    sw s0, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a1, 4
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB49_3
 ; RV32ZVE32F-NEXT:  .LBB49_12: # %cond.store3
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t6, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, 8
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB49_4
 ; RV32ZVE32F-NEXT:  .LBB49_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, 16
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB49_5
 ; RV32ZVE32F-NEXT:  .LBB49_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, 32
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB49_6
 ; RV32ZVE32F-NEXT:  .LBB49_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
@@ -5293,7 +5293,7 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
 ; RV32ZVE32F-NEXT:    sw a6, 0(a0)
 ; RV32ZVE32F-NEXT:    sw a7, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, 64
+; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB49_7
 ; RV32ZVE32F-NEXT:  .LBB49_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
@@ -5301,7 +5301,7 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
 ; RV32ZVE32F-NEXT:    sw a4, 0(a0)
 ; RV32ZVE32F-NEXT:    sw a5, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, -128
+; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB49_8
 ; RV32ZVE32F-NEXT:    j .LBB49_9
 ;
@@ -5338,9 +5338,9 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:  .LBB49_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a0, a5, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 4
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB49_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a0, a5, 8
@@ -5358,8 +5358,8 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
 ; RV64ZVE32F-NEXT:    sd a4, 0(a0)
 ; RV64ZVE32F-NEXT:  .LBB49_9: # %else10
-; RV64ZVE32F-NEXT:    andi a0, a5, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 64
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB49_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a0, a5, -128
@@ -5444,42 +5444,41 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    lw a7, 44(a0)
 ; RV32ZVE32F-NEXT:    lw a4, 48(a0)
 ; RV32ZVE32F-NEXT:    lw a5, 52(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 28(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 36(a0)
-; RV32ZVE32F-NEXT:    lw s0, 8(a0)
-; RV32ZVE32F-NEXT:    lw s1, 12(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw t6, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi s2, t0, 1
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    bnez s2, .LBB50_10
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
+; RV32ZVE32F-NEXT:    andi s1, a1, 1
+; RV32ZVE32F-NEXT:    bnez s1, .LBB50_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
-; RV32ZVE32F-NEXT:    andi a0, t0, 2
+; RV32ZVE32F-NEXT:    andi a0, a1, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB50_11
 ; RV32ZVE32F-NEXT:  .LBB50_2: # %else2
-; RV32ZVE32F-NEXT:    andi a0, t0, 4
+; RV32ZVE32F-NEXT:    andi a0, a1, 4
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB50_12
 ; RV32ZVE32F-NEXT:  .LBB50_3: # %else4
-; RV32ZVE32F-NEXT:    andi a0, t0, 8
+; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB50_13
 ; RV32ZVE32F-NEXT:  .LBB50_4: # %else6
-; RV32ZVE32F-NEXT:    andi a0, t0, 16
+; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB50_14
 ; RV32ZVE32F-NEXT:  .LBB50_5: # %else8
-; RV32ZVE32F-NEXT:    andi a0, t0, 32
+; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB50_15
 ; RV32ZVE32F-NEXT:  .LBB50_6: # %else10
-; RV32ZVE32F-NEXT:    andi a0, t0, 64
+; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB50_16
 ; RV32ZVE32F-NEXT:  .LBB50_7: # %else12
-; RV32ZVE32F-NEXT:    andi a0, t0, -128
+; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB50_9
 ; RV32ZVE32F-NEXT:  .LBB50_8: # %cond.store13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
@@ -5499,44 +5498,45 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    ret
 ; RV32ZVE32F-NEXT:  .LBB50_10: # %cond.store
 ; RV32ZVE32F-NEXT:    .cfi_restore_state
-; RV32ZVE32F-NEXT:    lw a1, 0(a0)
+; RV32ZVE32F-NEXT:    lw s1, 0(a0)
 ; RV32ZVE32F-NEXT:    lw a0, 4(a0)
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s s2, v8
-; RV32ZVE32F-NEXT:    sw a1, 0(s2)
+; RV32ZVE32F-NEXT:    sw s1, 0(s2)
 ; RV32ZVE32F-NEXT:    sw a0, 4(s2)
-; RV32ZVE32F-NEXT:    andi a0, t0, 2
+; RV32ZVE32F-NEXT:    andi a0, a1, 2
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB50_2
 ; RV32ZVE32F-NEXT:  .LBB50_11: # %cond.store1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw s0, 0(a0)
-; RV32ZVE32F-NEXT:    sw s1, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, 4
+; RV32ZVE32F-NEXT:    sw t6, 0(a0)
+; RV32ZVE32F-NEXT:    sw s0, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a1, 4
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB50_3
 ; RV32ZVE32F-NEXT:  .LBB50_12: # %cond.store3
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t6, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, 8
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB50_4
 ; RV32ZVE32F-NEXT:  .LBB50_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, 16
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB50_5
 ; RV32ZVE32F-NEXT:  .LBB50_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, 32
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB50_6
 ; RV32ZVE32F-NEXT:  .LBB50_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
@@ -5544,7 +5544,7 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
 ; RV32ZVE32F-NEXT:    sw a6, 0(a0)
 ; RV32ZVE32F-NEXT:    sw a7, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, 64
+; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB50_7
 ; RV32ZVE32F-NEXT:  .LBB50_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
@@ -5552,7 +5552,7 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
 ; RV32ZVE32F-NEXT:    sw a4, 0(a0)
 ; RV32ZVE32F-NEXT:    sw a5, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, t0, -128
+; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB50_8
 ; RV32ZVE32F-NEXT:    j .LBB50_9
 ;
@@ -5591,9 +5591,9 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:  .LBB50_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a0, a5, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 4
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB50_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a0, a5, 8
@@ -5612,8 +5612,8 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV64ZVE32F-NEXT:    add a0, a1, a0
 ; RV64ZVE32F-NEXT:    sd a4, 0(a0)
 ; RV64ZVE32F-NEXT:  .LBB50_9: # %else10
-; RV64ZVE32F-NEXT:    andi a0, a5, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a0, a5, 64
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB50_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a0, a5, -128
@@ -5745,9 +5745,9 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s4
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s5
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    andi s2, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    bnez s2, .LBB51_10
+; RV32ZVE32F-NEXT:    andi a1, a2, 1
+; RV32ZVE32F-NEXT:    bnez a1, .LBB51_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a0, a2, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB51_11
@@ -5928,8 +5928,8 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV64ZVE32F-NEXT:    ret
 ; RV64ZVE32F-NEXT:  .LBB51_10: # %cond.store
 ; RV64ZVE32F-NEXT:    .cfi_restore_state
-; RV64ZVE32F-NEXT:    ld a2, 0(a2)
 ; RV64ZVE32F-NEXT:    ld a0, 0(a0)
+; RV64ZVE32F-NEXT:    ld a2, 0(a2)
 ; RV64ZVE32F-NEXT:    slli a2, a2, 3
 ; RV64ZVE32F-NEXT:    add a2, a1, a2
 ; RV64ZVE32F-NEXT:    sd a0, 0(a2)
@@ -6350,9 +6350,9 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:  .LBB58_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB58_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -6375,9 +6375,9 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB58_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB58_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -6501,9 +6501,9 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:  .LBB59_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB59_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -6526,9 +6526,9 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB59_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB59_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -6653,9 +6653,9 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:  .LBB60_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB60_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -6679,9 +6679,9 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB60_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB60_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -6809,9 +6809,9 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:  .LBB61_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB61_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -6834,9 +6834,9 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB61_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB61_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -7453,9 +7453,9 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFH-NEXT:  .LBB68_4: # %else2
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFH-NEXT:    bnez a2, .LBB68_12
 ; RV64ZVE32F-ZVFH-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 8
@@ -7476,9 +7476,9 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-ZVFH-NEXT:    vse16.v v9, (a2)
 ; RV64ZVE32F-ZVFH-NEXT:  .LBB68_9: # %else10
-; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFH-NEXT:    bnez a2, .LBB68_15
 ; RV64ZVE32F-ZVFH-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-ZVFH-NEXT:    andi a1, a1, -128
@@ -7567,9 +7567,9 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB68_4: # %else2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    bnez a2, .LBB68_12
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 8
@@ -7592,9 +7592,9 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB68_9: # %else10
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFHMIN-NEXT:    bnez a2, .LBB68_15
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a1, a1, -128
@@ -7714,9 +7714,9 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFH-NEXT:  .LBB69_4: # %else2
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFH-NEXT:    bnez a2, .LBB69_12
 ; RV64ZVE32F-ZVFH-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 8
@@ -7737,9 +7737,9 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-ZVFH-NEXT:    vse16.v v9, (a2)
 ; RV64ZVE32F-ZVFH-NEXT:  .LBB69_9: # %else10
-; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFH-NEXT:    bnez a2, .LBB69_15
 ; RV64ZVE32F-ZVFH-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-ZVFH-NEXT:    andi a1, a1, -128
@@ -7828,9 +7828,9 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB69_4: # %else2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    bnez a2, .LBB69_12
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 8
@@ -7853,9 +7853,9 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB69_9: # %else10
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFHMIN-NEXT:    bnez a2, .LBB69_15
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a1, a1, -128
@@ -7976,9 +7976,9 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFH-NEXT:  .LBB70_4: # %else2
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFH-NEXT:    bnez a2, .LBB70_12
 ; RV64ZVE32F-ZVFH-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 8
@@ -8000,9 +8000,9 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-ZVFH-NEXT:    vse16.v v9, (a2)
 ; RV64ZVE32F-ZVFH-NEXT:  .LBB70_9: # %else10
-; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFH-NEXT:    bnez a2, .LBB70_15
 ; RV64ZVE32F-ZVFH-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-ZVFH-NEXT:    andi a1, a1, -128
@@ -8098,9 +8098,9 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB70_4: # %else2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    bnez a2, .LBB70_12
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 8
@@ -8124,9 +8124,9 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB70_9: # %else10
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFHMIN-NEXT:    bnez a2, .LBB70_15
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a1, a1, -128
@@ -8250,9 +8250,9 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFH-NEXT:  .LBB71_4: # %else2
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFH-NEXT:    bnez a2, .LBB71_12
 ; RV64ZVE32F-ZVFH-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 8
@@ -8273,9 +8273,9 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-ZVFH-NEXT:    vse16.v v9, (a2)
 ; RV64ZVE32F-ZVFH-NEXT:  .LBB71_9: # %else10
-; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-ZVFH-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFH-NEXT:    bnez a2, .LBB71_15
 ; RV64ZVE32F-ZVFH-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-ZVFH-NEXT:    andi a1, a1, -128
@@ -8363,9 +8363,9 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB71_4: # %else2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    bnez a2, .LBB71_12
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 8
@@ -8388,9 +8388,9 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB71_9: # %else10
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-ZVFHMIN-NEXT:    bnez a2, .LBB71_15
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a1, a1, -128
@@ -8795,9 +8795,9 @@ define void @mscatter_baseidx_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x i8> %
 ; RV64ZVE32F-NEXT:  .LBB78_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v10, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB78_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -8819,9 +8819,9 @@ define void @mscatter_baseidx_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x i8> %
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse32.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB78_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB78_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -8932,9 +8932,9 @@ define void @mscatter_baseidx_sext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x
 ; RV64ZVE32F-NEXT:  .LBB79_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v10, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB79_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -8956,9 +8956,9 @@ define void @mscatter_baseidx_sext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse32.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB79_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB79_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -9073,9 +9073,9 @@ define void @mscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x
 ; RV64ZVE32F-NEXT:  .LBB80_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v10, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB80_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -9098,9 +9098,9 @@ define void @mscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse32.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB80_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB80_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -9219,9 +9219,9 @@ define void @mscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16>
 ; RV64ZVE32F-NEXT:  .LBB81_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v10, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB81_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -9243,9 +9243,9 @@ define void @mscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16>
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse32.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB81_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB81_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -9358,9 +9358,9 @@ define void @mscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x
 ; RV64ZVE32F-NEXT:  .LBB82_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v10, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB82_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -9382,9 +9382,9 @@ define void @mscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse32.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB82_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB82_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -9500,9 +9500,9 @@ define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x
 ; RV64ZVE32F-NEXT:  .LBB83_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v10, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB83_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -9525,9 +9525,9 @@ define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse32.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB83_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB83_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -9641,9 +9641,9 @@ define void @mscatter_baseidx_v8f32(<8 x float> %val, ptr %base, <8 x i32> %idxs
 ; RV64ZVE32F-NEXT:  .LBB84_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v12, v10, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB84_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -9665,9 +9665,9 @@ define void @mscatter_baseidx_v8f32(<8 x float> %val, ptr %base, <8 x i32> %idxs
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse32.v v10, (a2)
 ; RV64ZVE32F-NEXT:  .LBB84_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v12, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB84_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -10179,9 +10179,9 @@ define void @mscatter_baseidx_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x i8>
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
-; RV32ZVE32F-NEXT:    bnez a2, .LBB91_9
+; RV32ZVE32F-NEXT:    andi a0, a1, 1
+; RV32ZVE32F-NEXT:    bnez a0, .LBB91_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a0, a1, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB91_10
@@ -10283,9 +10283,9 @@ define void @mscatter_baseidx_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:  .LBB91_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB91_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -10303,8 +10303,8 @@ define void @mscatter_baseidx_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fsd fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB91_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB91_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -10379,9 +10379,9 @@ define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
-; RV32ZVE32F-NEXT:    bnez a2, .LBB92_9
+; RV32ZVE32F-NEXT:    andi a0, a1, 1
+; RV32ZVE32F-NEXT:    bnez a0, .LBB92_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a0, a1, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB92_10
@@ -10483,9 +10483,9 @@ define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x
 ; RV64ZVE32F-NEXT:  .LBB92_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB92_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -10503,8 +10503,8 @@ define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x
 ; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fsd fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB92_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB92_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -10581,9 +10581,9 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
-; RV32ZVE32F-NEXT:    bnez a2, .LBB93_9
+; RV32ZVE32F-NEXT:    andi a0, a1, 1
+; RV32ZVE32F-NEXT:    bnez a0, .LBB93_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a0, a1, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB93_10
@@ -10687,9 +10687,9 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x
 ; RV64ZVE32F-NEXT:  .LBB93_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB93_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -10708,8 +10708,8 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x
 ; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fsd fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB93_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB93_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -10789,10 +10789,10 @@ define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16
 ; RV32ZVE32F-NEXT:    li a1, 8
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v0
-; RV32ZVE32F-NEXT:    andi a2, a0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vwmaccus.vx v10, a1, v8
-; RV32ZVE32F-NEXT:    bnez a2, .LBB94_9
+; RV32ZVE32F-NEXT:    andi a1, a0, 1
+; RV32ZVE32F-NEXT:    bnez a1, .LBB94_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a1, a0, 2
 ; RV32ZVE32F-NEXT:    bnez a1, .LBB94_10
@@ -10896,9 +10896,9 @@ define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16
 ; RV64ZVE32F-NEXT:  .LBB94_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB94_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -10916,8 +10916,8 @@ define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16
 ; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fsd fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB94_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB94_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -10991,10 +10991,10 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8
 ; RV32ZVE32F-NEXT:    li a1, 8
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v0
-; RV32ZVE32F-NEXT:    andi a2, a0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vwmaccus.vx v10, a1, v8
-; RV32ZVE32F-NEXT:    bnez a2, .LBB95_9
+; RV32ZVE32F-NEXT:    andi a1, a0, 1
+; RV32ZVE32F-NEXT:    bnez a1, .LBB95_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a1, a0, 2
 ; RV32ZVE32F-NEXT:    bnez a1, .LBB95_10
@@ -11098,9 +11098,9 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:  .LBB95_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB95_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -11118,8 +11118,8 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fsd fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB95_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB95_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -11195,10 +11195,10 @@ define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8
 ; RV32ZVE32F-NEXT:    li a1, 8
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v0
-; RV32ZVE32F-NEXT:    andi a2, a0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vwmaccu.vx v10, a1, v8
-; RV32ZVE32F-NEXT:    bnez a2, .LBB96_9
+; RV32ZVE32F-NEXT:    andi a1, a0, 1
+; RV32ZVE32F-NEXT:    bnez a1, .LBB96_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a1, a0, 2
 ; RV32ZVE32F-NEXT:    bnez a1, .LBB96_10
@@ -11304,9 +11304,9 @@ define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:  .LBB96_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB96_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -11325,8 +11325,8 @@ define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fsd fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB96_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB96_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -11405,10 +11405,10 @@ define void @mscatter_baseidx_v8i32_v8f64(<8 x double> %val, ptr %base, <8 x i32
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
-; RV32ZVE32F-NEXT:    bnez a2, .LBB97_9
+; RV32ZVE32F-NEXT:    andi a0, a1, 1
+; RV32ZVE32F-NEXT:    bnez a0, .LBB97_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a0, a1, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB97_10
@@ -11511,9 +11511,9 @@ define void @mscatter_baseidx_v8i32_v8f64(<8 x double> %val, ptr %base, <8 x i32
 ; RV64ZVE32F-NEXT:  .LBB97_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB97_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -11531,8 +11531,8 @@ define void @mscatter_baseidx_v8i32_v8f64(<8 x double> %val, ptr %base, <8 x i32
 ; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fsd fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB97_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB97_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -11605,10 +11605,10 @@ define void @mscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, ptr %base, <8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
-; RV32ZVE32F-NEXT:    bnez a2, .LBB98_9
+; RV32ZVE32F-NEXT:    andi a0, a1, 1
+; RV32ZVE32F-NEXT:    bnez a0, .LBB98_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a0, a1, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB98_10
@@ -11711,9 +11711,9 @@ define void @mscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:  .LBB98_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB98_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -11731,8 +11731,8 @@ define void @mscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fsd fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB98_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB98_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -11806,10 +11806,10 @@ define void @mscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, ptr %base, <8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
-; RV32ZVE32F-NEXT:    bnez a2, .LBB99_9
+; RV32ZVE32F-NEXT:    andi a0, a1, 1
+; RV32ZVE32F-NEXT:    bnez a0, .LBB99_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a0, a1, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB99_10
@@ -11914,9 +11914,9 @@ define void @mscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:  .LBB99_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB99_12
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -11935,8 +11935,8 @@ define void @mscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fsd fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB99_9: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB99_15
 ; RV64ZVE32F-NEXT:  # %bb.10: # %else12
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -12031,9 +12031,9 @@ define void @mscatter_baseidx_v8f64(<8 x double> %val, ptr %base, <8 x i64> %idx
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
-; RV32ZVE32F-NEXT:    bnez a2, .LBB100_9
+; RV32ZVE32F-NEXT:    andi a0, a1, 1
+; RV32ZVE32F-NEXT:    bnez a0, .LBB100_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
 ; RV32ZVE32F-NEXT:    andi a0, a1, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB100_10
@@ -12244,9 +12244,9 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs,
 ; RV64ZVE32F-NEXT:  .LBB101_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB101_25
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -12261,9 +12261,9 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs,
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v8, 4
 ; RV64ZVE32F-NEXT:    vse8.v v11, (a2)
 ; RV64ZVE32F-NEXT:  .LBB101_8: # %else8
-; RV64ZVE32F-NEXT:    andi a2, a1, 32
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 8
+; RV64ZVE32F-NEXT:    andi a2, a1, 32
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB101_10
 ; RV64ZVE32F-NEXT:  # %bb.9: # %cond.store9
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
@@ -12274,9 +12274,9 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs,
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v8, 5
 ; RV64ZVE32F-NEXT:    vse8.v v11, (a2)
 ; RV64ZVE32F-NEXT:  .LBB101_10: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB101_27
 ; RV64ZVE32F-NEXT:  # %bb.11: # %else12
 ; RV64ZVE32F-NEXT:    andi a2, a1, 128
@@ -12298,9 +12298,9 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs,
 ; RV64ZVE32F-NEXT:  .LBB101_15: # %else18
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 1024
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 1024
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB101_30
 ; RV64ZVE32F-NEXT:  # %bb.16: # %else20
 ; RV64ZVE32F-NEXT:    slli a2, a1, 52
@@ -12320,9 +12320,9 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs,
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 13
 ; RV64ZVE32F-NEXT:    vse8.v v9, (a2)
 ; RV64ZVE32F-NEXT:  .LBB101_20: # %else26
-; RV64ZVE32F-NEXT:    slli a2, a1, 49
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT:    slli a2, a1, 49
 ; RV64ZVE32F-NEXT:    bgez a2, .LBB101_22
 ; RV64ZVE32F-NEXT:  # %bb.21: # %cond.store27
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
@@ -12443,11 +12443,11 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
 ; RV64V-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
 ; RV64V-NEXT:    vslidedown.vi v8, v8, 16
 ; RV64V-NEXT:    vslidedown.vi v10, v10, 16
-; RV64V-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64V-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64V-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64V-NEXT:    vsext.vf8 v16, v10
-; RV64V-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV64V-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64V-NEXT:    vslidedown.vi v0, v0, 2
+; RV64V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64V-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
 ; RV64V-NEXT:    ret
 ;
@@ -12476,9 +12476,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
 ; RV64ZVE32F-NEXT:  .LBB102_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v13, v10, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v12, v10, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB102_49
 ; RV64ZVE32F-NEXT:  # %bb.5: # %else4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -12493,9 +12493,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
 ; RV64ZVE32F-NEXT:    vslidedown.vi v12, v8, 4
 ; RV64ZVE32F-NEXT:    vse8.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB102_8: # %else8
-; RV64ZVE32F-NEXT:    andi a2, a1, 32
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v12, v10, 8
+; RV64ZVE32F-NEXT:    andi a2, a1, 32
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB102_10
 ; RV64ZVE32F-NEXT:  # %bb.9: # %cond.store9
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
@@ -12506,9 +12506,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
 ; RV64ZVE32F-NEXT:    vslidedown.vi v14, v8, 5
 ; RV64ZVE32F-NEXT:    vse8.v v14, (a2)
 ; RV64ZVE32F-NEXT:  .LBB102_10: # %else10
-; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v13, v13, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 64
 ; RV64ZVE32F-NEXT:    bnez a2, .LBB102_51
 ; RV64ZVE32F-NEXT:  # %bb.11: # %else12
 ; RV64ZVE32F-NEXT:    andi a2, a1, 128
@@ -12530,9 +12530,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
 ; RV64ZVE32F-NEXT:  .LBB102_15: # %else18
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v13, v12, 4
-; RV64ZVE32F-NEXT:    andi a2, a1, 1024
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v12, v12, 2
+; RV64ZVE32F-NEXT:    andi a2, a1, 1024
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB102_17
 ; RV64ZVE32F-NEXT:  # %bb.16: # %cond.store19
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v12
@@ -12552,9 +12552,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
 ; RV64ZVE32F-NEXT:    vslidedown.vi v12, v8, 11
 ; RV64ZVE32F-NEXT:    vse8.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB102_19: # %else22
-; RV64ZVE32F-NEXT:    slli a2, a1, 51
 ; RV64ZVE32F-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 16
+; RV64ZVE32F-NEXT:    slli a2, a1, 51
 ; RV64ZVE32F-NEXT:    bgez a2, .LBB102_21
 ; RV64ZVE32F-NEXT:  # %bb.20: # %cond.store23
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v13
@@ -12574,9 +12574,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v8, 13
 ; RV64ZVE32F-NEXT:    vse8.v v11, (a2)
 ; RV64ZVE32F-NEXT:  .LBB102_23: # %else26
-; RV64ZVE32F-NEXT:    slli a2, a1, 49
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v13, 2
+; RV64ZVE32F-NEXT:    slli a2, a1, 49
 ; RV64ZVE32F-NEXT:    bltz a2, .LBB102_54
 ; RV64ZVE32F-NEXT:  # %bb.24: # %else28
 ; RV64ZVE32F-NEXT:    slli a2, a1, 48
@@ -12599,9 +12599,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
 ; RV64ZVE32F-NEXT:  .LBB102_28: # %else34
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v10, 4
-; RV64ZVE32F-NEXT:    slli a2, a1, 45
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v12, v10, 2
+; RV64ZVE32F-NEXT:    slli a2, a1, 45
 ; RV64ZVE32F-NEXT:    bltz a2, .LBB102_57
 ; RV64ZVE32F-NEXT:  # %bb.29: # %else36
 ; RV64ZVE32F-NEXT:    slli a2, a1, 44
@@ -12617,9 +12617,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse8.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB102_32: # %else40
-; RV64ZVE32F-NEXT:    slli a2, a1, 42
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 8
+; RV64ZVE32F-NEXT:    slli a2, a1, 42
 ; RV64ZVE32F-NEXT:    bgez a2, .LBB102_34
 ; RV64ZVE32F-NEXT:  # %bb.33: # %cond.store41
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
@@ -12631,9 +12631,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse8.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB102_34: # %else42
-; RV64ZVE32F-NEXT:    slli a2, a1, 41
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v11, 2
+; RV64ZVE32F-NEXT:    slli a2, a1, 41
 ; RV64ZVE32F-NEXT:    bltz a2, .LBB102_59
 ; RV64ZVE32F-NEXT:  # %bb.35: # %else44
 ; RV64ZVE32F-NEXT:    slli a2, a1, 40
@@ -12656,9 +12656,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
 ; RV64ZVE32F-NEXT:  .LBB102_39: # %else50
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v10, 4
-; RV64ZVE32F-NEXT:    slli a2, a1, 37
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT:    slli a2, a1, 37
 ; RV64ZVE32F-NEXT:    bltz a2, .LBB102_62
 ; RV64ZVE32F-NEXT:  # %bb.40: # %else52
 ; RV64ZVE32F-NEXT:    slli a2, a1, 36
@@ -12679,9 +12679,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vse8.v v12, (a2)
 ; RV64ZVE32F-NEXT:  .LBB102_44: # %else58
-; RV64ZVE32F-NEXT:    slli a2, a1, 33
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT:    slli a2, a1, 33
 ; RV64ZVE32F-NEXT:    bgez a2, .LBB102_46
 ; RV64ZVE32F-NEXT:  # %bb.45: # %cond.store59
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
index ed6ec4d5659b1..6421d7c8022f4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
@@ -242,9 +242,9 @@ define void @masked_store_v32f64(<32 x double> %val, ptr %a, <32 x i1> %mask) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a0), v0.t
+; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
@@ -278,12 +278,12 @@ define void @masked_store_v64f32(<64 x float> %val, ptr %a, <64 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v64f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 4
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vse32.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v0, 4
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vse32.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v64f32.p0(<64 x float> %val, ptr %a, i32 8, <64 x i1> %mask)
@@ -294,12 +294,12 @@ define void @masked_store_v128bf16(<128 x bfloat> %val, ptr %a, <128 x i1> %mask
 ; CHECK-LABEL: masked_store_v128bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vse16.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v0, 8
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vse16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v128bf16.p0(<128 x bfloat> %val, ptr %a, i32 8, <128 x i1> %mask)
@@ -310,12 +310,12 @@ define void @masked_store_v128f16(<128 x half> %val, ptr %a, <128 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v128f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vse16.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v0, 8
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vse16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v128f16.p0(<128 x half> %val, ptr %a, i32 8, <128 x i1> %mask)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
index c3b10db115bae..7a9fc0ecd8bb0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
@@ -240,9 +240,9 @@ define void @masked_store_v32i64(<32 x i64> %val, ptr %a, <32 x i1> %mask) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a0), v0.t
+; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
@@ -276,12 +276,12 @@ define void @masked_store_v64i32(<64 x i32> %val, ptr %a, <64 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v64i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 4
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vse32.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v0, 4
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vse32.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v64i32.p0(<64 x i32> %val, ptr %a, i32 8, <64 x i1> %mask)
@@ -303,12 +303,12 @@ define void @masked_store_v128i16(<128 x i16> %val, ptr %a, <128 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v128i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vse16.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v0, 8
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vse16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v128i16.p0(<128 x i16> %val, ptr %a, i32 8, <128 x i1> %mask)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
index 80a9143d1ad8b..79b4f8852a4b9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
@@ -9,19 +9,19 @@ declare <2 x half> @llvm.vp.nearbyint.v2f16(<2 x half>, <2 x i1>, i32)
 define <2 x half> @vp_nearbyint_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v2f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI0_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
+; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <2 x half> @llvm.vp.nearbyint.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl)
   ret <2 x half> %v
@@ -30,17 +30,17 @@ define <2 x half> @vp_nearbyint_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %
 define <2 x half> @vp_nearbyint_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v2f16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI1_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <2 x half> @llvm.vp.nearbyint.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x half> %v
@@ -51,19 +51,19 @@ declare <4 x half> @llvm.vp.nearbyint.v4f16(<4 x half>, <4 x i1>, i32)
 define <4 x half> @vp_nearbyint_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v4f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI2_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
+; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <4 x half> @llvm.vp.nearbyint.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl)
   ret <4 x half> %v
@@ -72,17 +72,17 @@ define <4 x half> @vp_nearbyint_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %
 define <4 x half> @vp_nearbyint_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v4f16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI3_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <4 x half> @llvm.vp.nearbyint.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x half> %v
@@ -93,19 +93,19 @@ declare <8 x half> @llvm.vp.nearbyint.v8f16(<8 x half>, <8 x i1>, i32)
 define <8 x half> @vp_nearbyint_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI4_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
+; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <8 x half> @llvm.vp.nearbyint.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl)
   ret <8 x half> %v
@@ -114,17 +114,17 @@ define <8 x half> @vp_nearbyint_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %
 define <8 x half> @vp_nearbyint_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v8f16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI5_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <8 x half> @llvm.vp.nearbyint.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x half> %v
@@ -137,19 +137,19 @@ define <16 x half> @vp_nearbyint_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI6_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI6_0)(a0)
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <16 x half> @llvm.vp.nearbyint.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl)
   ret <16 x half> %v
@@ -158,17 +158,17 @@ define <16 x half> @vp_nearbyint_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe
 define <16 x half> @vp_nearbyint_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v16f16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI7_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI7_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI7_0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI7_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <16 x half> @llvm.vp.nearbyint.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x half> %v
@@ -183,15 +183,15 @@ define <2 x float> @vp_nearbyint_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <2 x float> @llvm.vp.nearbyint.v2f32(<2 x float> %va, <2 x i1> %m, i32 %evl)
   ret <2 x float> %v
@@ -204,13 +204,13 @@ define <2 x float> @vp_nearbyint_v2f32_unmasked(<2 x float> %va, i32 zeroext %ev
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <2 x float> @llvm.vp.nearbyint.v2f32(<2 x float> %va, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x float> %v
@@ -225,15 +225,15 @@ define <4 x float> @vp_nearbyint_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <4 x float> @llvm.vp.nearbyint.v4f32(<4 x float> %va, <4 x i1> %m, i32 %evl)
   ret <4 x float> %v
@@ -246,13 +246,13 @@ define <4 x float> @vp_nearbyint_v4f32_unmasked(<4 x float> %va, i32 zeroext %ev
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <4 x float> @llvm.vp.nearbyint.v4f32(<4 x float> %va, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x float> %v
@@ -268,16 +268,16 @@ define <8 x float> @vp_nearbyint_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <8 x float> @llvm.vp.nearbyint.v8f32(<8 x float> %va, <8 x i1> %m, i32 %evl)
   ret <8 x float> %v
@@ -290,13 +290,13 @@ define <8 x float> @vp_nearbyint_v8f32_unmasked(<8 x float> %va, i32 zeroext %ev
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <8 x float> @llvm.vp.nearbyint.v8f32(<8 x float> %va, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x float> %v
@@ -312,16 +312,16 @@ define <16 x float> @vp_nearbyint_v16f32(<16 x float> %va, <16 x i1> %m, i32 zer
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <16 x float> @llvm.vp.nearbyint.v16f32(<16 x float> %va, <16 x i1> %m, i32 %evl)
   ret <16 x float> %v
@@ -334,13 +334,13 @@ define <16 x float> @vp_nearbyint_v16f32_unmasked(<16 x float> %va, i32 zeroext
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <16 x float> @llvm.vp.nearbyint.v16f32(<16 x float> %va, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x float> %v
@@ -351,19 +351,19 @@ declare <2 x double> @llvm.vp.nearbyint.v2f64(<2 x double>, <2 x i1>, i32)
 define <2 x double> @vp_nearbyint_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI16_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI16_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
+; CHECK-NEXT:    lui a0, %hi(.LCPI16_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI16_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <2 x double> @llvm.vp.nearbyint.v2f64(<2 x double> %va, <2 x i1> %m, i32 %evl)
   ret <2 x double> %v
@@ -372,17 +372,17 @@ define <2 x double> @vp_nearbyint_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroe
 define <2 x double> @vp_nearbyint_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v2f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI17_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <2 x double> @llvm.vp.nearbyint.v2f64(<2 x double> %va, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x double> %v
@@ -395,19 +395,19 @@ define <4 x double> @vp_nearbyint_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <4 x double> @llvm.vp.nearbyint.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl)
   ret <4 x double> %v
@@ -416,17 +416,17 @@ define <4 x double> @vp_nearbyint_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroe
 define <4 x double> @vp_nearbyint_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v4f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI19_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <4 x double> @llvm.vp.nearbyint.v4f64(<4 x double> %va, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x double> %v
@@ -439,19 +439,19 @@ define <8 x double> @vp_nearbyint_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a0)
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl)
   ret <8 x double> %v
@@ -460,17 +460,17 @@ define <8 x double> @vp_nearbyint_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroe
 define <8 x double> @vp_nearbyint_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v8f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI21_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI21_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI21_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI21_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double> %va, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x double> %v
@@ -483,19 +483,19 @@ define <15 x double> @vp_nearbyint_v15f64(<15 x double> %va, <15 x i1> %m, i32 z
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <15 x double> @llvm.vp.nearbyint.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl)
   ret <15 x double> %v
@@ -504,17 +504,17 @@ define <15 x double> @vp_nearbyint_v15f64(<15 x double> %va, <15 x i1> %m, i32 z
 define <15 x double> @vp_nearbyint_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v15f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI23_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI23_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI23_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI23_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <15 x double> @llvm.vp.nearbyint.v15f64(<15 x double> %va, <15 x i1> splat (i1 true), i32 %evl)
   ret <15 x double> %v
@@ -527,19 +527,19 @@ define <16 x double> @vp_nearbyint_v16f64(<16 x double> %va, <16 x i1> %m, i32 z
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <16 x double> @llvm.vp.nearbyint.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl)
   ret <16 x double> %v
@@ -548,17 +548,17 @@ define <16 x double> @vp_nearbyint_v16f64(<16 x double> %va, <16 x i1> %m, i32 z
 define <16 x double> @vp_nearbyint_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v16f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI25_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI25_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI25_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI25_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <16 x double> @llvm.vp.nearbyint.v16f64(<16 x double> %va, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x double> %v
@@ -571,8 +571,8 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v6, v0
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
@@ -585,41 +585,41 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
+; CHECK-NEXT:    frflags a2
+; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; CHECK-NEXT:    frflags a1
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a2, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a1, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
-; CHECK-NEXT:    fsflags a1
+; CHECK-NEXT:    fsflags a2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -664,9 +664,9 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsflags a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    fsflags a1
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
index bf8baafc4a25d..ff6984eb82df1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
@@ -122,9 +122,9 @@ define i32 @reduce_sum_16xi32_prefix3(ptr %p) {
 ; CHECK-LABEL: reduce_sum_16xi32_prefix3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vmv.s.x v9, zero
-; CHECK-NEXT:    vredsum.vs v8, v8, v9
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vle32.v v9, (a0)
+; CHECK-NEXT:    vredsum.vs v8, v9, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
@@ -140,9 +140,9 @@ define i32 @reduce_sum_16xi32_prefix4(ptr %p) {
 ; CHECK-LABEL: reduce_sum_16xi32_prefix4:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vmv.s.x v9, zero
-; CHECK-NEXT:    vredsum.vs v8, v8, v9
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vle32.v v9, (a0)
+; CHECK-NEXT:    vredsum.vs v8, v9, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
@@ -160,9 +160,9 @@ define i32 @reduce_sum_16xi32_prefix5(ptr %p) {
 ; CHECK-LABEL: reduce_sum_16xi32_prefix5:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vmv.s.x v10, zero
-; CHECK-NEXT:    vredsum.vs v8, v8, v10
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vredsum.vs v8, v10, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
@@ -182,9 +182,9 @@ define i32 @reduce_sum_16xi32_prefix6(ptr %p) {
 ; CHECK-LABEL: reduce_sum_16xi32_prefix6:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vmv.s.x v10, zero
-; CHECK-NEXT:    vredsum.vs v8, v8, v10
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vredsum.vs v8, v10, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
@@ -206,9 +206,9 @@ define i32 @reduce_sum_16xi32_prefix7(ptr %p) {
 ; CHECK-LABEL: reduce_sum_16xi32_prefix7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 7, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vmv.s.x v10, zero
-; CHECK-NEXT:    vredsum.vs v8, v8, v10
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vredsum.vs v8, v10, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
@@ -232,9 +232,9 @@ define i32 @reduce_sum_16xi32_prefix8(ptr %p) {
 ; CHECK-LABEL: reduce_sum_16xi32_prefix8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vmv.s.x v10, zero
-; CHECK-NEXT:    vredsum.vs v8, v8, v10
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vredsum.vs v8, v10, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
@@ -492,9 +492,9 @@ define i32 @reduce_xor_16xi32_prefix5(ptr %p) {
 ; CHECK-LABEL: reduce_xor_16xi32_prefix5:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vmv.s.x v10, zero
-; CHECK-NEXT:    vredxor.vs v8, v8, v10
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vredxor.vs v8, v10, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
@@ -568,9 +568,9 @@ define i32 @reduce_or_16xi32_prefix5(ptr %p) {
 ; CHECK-LABEL: reduce_or_16xi32_prefix5:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vmv.s.x v10, zero
-; CHECK-NEXT:    vredor.vs v8, v8, v10
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vredor.vs v8, v10, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
@@ -609,11 +609,11 @@ define i32 @reduce_smax_16xi32_prefix2(ptr %p) {
 define i32 @reduce_smax_16xi32_prefix5(ptr %p) {
 ; CHECK-LABEL: reduce_smax_16xi32_prefix5:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, 524288
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    lui a0, 524288
-; CHECK-NEXT:    vmv.s.x v10, a0
-; CHECK-NEXT:    vredmax.vs v8, v8, v10
+; CHECK-NEXT:    vmv.s.x v8, a1
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vredmax.vs v8, v10, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
@@ -687,9 +687,9 @@ define i32 @reduce_umax_16xi32_prefix5(ptr %p) {
 ; CHECK-LABEL: reduce_umax_16xi32_prefix5:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vmv.s.x v10, zero
-; CHECK-NEXT:    vredmaxu.vs v8, v8, v10
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vredmaxu.vs v8, v10, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
@@ -734,11 +734,11 @@ define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
 ;
 ; RV64-LABEL: reduce_umin_16xi32_prefix5:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    li a1, -1
 ; RV64-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    li a0, -1
-; RV64-NEXT:    vmv.s.x v10, a0
-; RV64-NEXT:    vredminu.vs v8, v8, v10
+; RV64-NEXT:    vmv.s.x v8, a1
+; RV64-NEXT:    vle32.v v10, (a0)
+; RV64-NEXT:    vredminu.vs v8, v10, v8
 ; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    ret
   %v = load <16 x i32>, ptr %p, align 256
@@ -758,9 +758,9 @@ define float @reduce_fadd_16xf32_prefix2(ptr %p) {
 ; CHECK-LABEL: reduce_fadd_16xf32_prefix2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vmv.s.x v9, zero
-; CHECK-NEXT:    vfredusum.vs v8, v8, v9
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vle32.v v9, (a0)
+; CHECK-NEXT:    vfredusum.vs v8, v9, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x float>, ptr %p, align 256
@@ -773,11 +773,11 @@ define float @reduce_fadd_16xf32_prefix2(ptr %p) {
 define float @reduce_fadd_16xi32_prefix5(ptr %p) {
 ; CHECK-LABEL: reduce_fadd_16xi32_prefix5:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, 524288
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    lui a0, 524288
-; CHECK-NEXT:    vmv.s.x v10, a0
-; CHECK-NEXT:    vfredusum.vs v8, v8, v10
+; CHECK-NEXT:    vmv.s.x v8, a1
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vfredusum.vs v8, v10, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x float>, ptr %p, align 256
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
index 6684e6d223eac..c2cac3eeb7a46 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
@@ -121,9 +121,9 @@ declare float @llvm.vp.reduce.fadd.v64f32(float, <64 x float>, <64 x i1>, i32)
 define float @vpreduce_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpreduce_fadd_v64f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 32
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 4
+; CHECK-NEXT:    li a2, 32
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB8_2
 ; CHECK-NEXT:  # %bb.1:
@@ -149,9 +149,9 @@ define float @vpreduce_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m, i32
 define float @vpreduce_ord_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpreduce_ord_fadd_v64f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 32
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 4
+; CHECK-NEXT:    li a2, 32
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB9_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index 2b279389253b0..23197ede1da49 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -253,11 +253,11 @@ define half @vreduce_ord_fadd_v128f16(ptr %x, half %s) {
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    li a2, 64
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vle16.v v16, (a1)
+; CHECK-NEXT:    vle16.v v8, (a1)
+; CHECK-NEXT:    vle16.v v16, (a0)
 ; CHECK-NEXT:    vfmv.s.f v24, fa0
-; CHECK-NEXT:    vfredosum.vs v8, v8, v24
-; CHECK-NEXT:    vfredosum.vs v8, v16, v8
+; CHECK-NEXT:    vfredosum.vs v16, v16, v24
+; CHECK-NEXT:    vfredosum.vs v8, v8, v16
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <128 x half>, ptr %x
@@ -744,11 +744,11 @@ define float @vreduce_ord_fadd_v64f32(ptr %x, float %s) {
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    li a2, 32
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vle32.v v16, (a1)
+; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    vfmv.s.f v24, fa0
-; CHECK-NEXT:    vfredosum.vs v8, v8, v24
-; CHECK-NEXT:    vfredosum.vs v8, v16, v8
+; CHECK-NEXT:    vfredosum.vs v16, v16, v24
+; CHECK-NEXT:    vfredosum.vs v8, v8, v16
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <64 x float>, ptr %x
@@ -1135,11 +1135,11 @@ define double @vreduce_ord_fadd_v32f64(ptr %x, double %s) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    vle64.v v8, (a1)
+; CHECK-NEXT:    vle64.v v16, (a0)
 ; CHECK-NEXT:    vfmv.s.f v24, fa0
-; CHECK-NEXT:    vfredosum.vs v8, v8, v24
-; CHECK-NEXT:    vfredosum.vs v8, v16, v8
+; CHECK-NEXT:    vfredosum.vs v16, v16, v24
+; CHECK-NEXT:    vfredosum.vs v8, v8, v16
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <32 x double>, ptr %x
@@ -1344,17 +1344,17 @@ define float @vreduce_fmin_v128f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fmin_v128f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    addi a2, a0, 384
+; CHECK-NEXT:    addi a2, a0, 256
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a2)
-; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    addi a1, a0, 384
 ; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    vle32.v v0, (a1)
-; CHECK-NEXT:    vfmin.vv v8, v24, v8
-; CHECK-NEXT:    vfmin.vv v16, v16, v0
+; CHECK-NEXT:    vle32.v v24, (a1)
+; CHECK-NEXT:    vle32.v v0, (a0)
+; CHECK-NEXT:    vfmin.vv v24, v0, v24
 ; CHECK-NEXT:    vfmin.vv v8, v16, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v24
 ; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -1591,17 +1591,17 @@ define float @vreduce_fmax_v128f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fmax_v128f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    addi a2, a0, 384
+; CHECK-NEXT:    addi a2, a0, 256
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a2)
-; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    addi a1, a0, 384
 ; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    vle32.v v0, (a1)
-; CHECK-NEXT:    vfmax.vv v8, v24, v8
-; CHECK-NEXT:    vfmax.vv v16, v16, v0
+; CHECK-NEXT:    vle32.v v24, (a1)
+; CHECK-NEXT:    vle32.v v0, (a0)
+; CHECK-NEXT:    vfmax.vv v24, v0, v24
 ; CHECK-NEXT:    vfmax.vv v8, v16, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v24
 ; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -1997,59 +1997,56 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    addi a2, a0, 128
+; CHECK-NEXT:    addi a2, a0, 384
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a2)
-; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle32.v v16, (a1)
+; CHECK-NEXT:    vle32.v v16, (a2)
+; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vle32.v v24, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
 ; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle32.v v24, (a1)
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v16, v8
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vle32.v v24, (a1)
+; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v16, v16, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v24, v24, v8
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB121_2
@@ -2077,17 +2074,17 @@ define float @vreduce_fminimum_v128f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v128f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    addi a2, a0, 384
+; CHECK-NEXT:    addi a2, a0, 256
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a2)
-; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    addi a1, a0, 384
 ; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    vle32.v v0, (a1)
-; CHECK-NEXT:    vfmin.vv v8, v24, v8
-; CHECK-NEXT:    vfmin.vv v16, v16, v0
+; CHECK-NEXT:    vle32.v v24, (a1)
+; CHECK-NEXT:    vle32.v v0, (a0)
+; CHECK-NEXT:    vfmin.vv v24, v0, v24
 ; CHECK-NEXT:    vfmin.vv v8, v16, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v24
 ; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -2245,42 +2242,25 @@ declare double @llvm.vector.reduce.fminimum.v32f64(<32 x double>)
 define double @vreduce_fminimum_v32f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT:    vfmin.vv v8, v8, v24
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB131_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    lui a0, %hi(.LCPI131_0)
 ; CHECK-NEXT:    fld fa0, %lo(.LCPI131_0)(a0)
-; CHECK-NEXT:    j .LBB131_3
+; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB131_2:
 ; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
-; CHECK-NEXT:  .LBB131_3:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = load <32 x double>, ptr %x
   %red = call double @llvm.vector.reduce.fminimum.v32f64(<32 x double> %v)
@@ -2314,59 +2294,56 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    addi a1, a0, 384
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
 ; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vle64.v v24, (a0)
+; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v16, v8
+; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v16, v16, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vmfeq.vv v7, v8, v8
+; CHECK-NEXT:    vmerge.vvm v16, v24, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
+; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v24, v24, v8
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB133_2
@@ -2395,15 +2372,15 @@ define double @vreduce_fminimum_v64f64_nonans(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    addi a1, a0, 384
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    vle64.v v0, (a1)
-; CHECK-NEXT:    vfmin.vv v16, v24, v16
-; CHECK-NEXT:    vfmin.vv v8, v8, v0
+; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vle64.v v0, (a0)
+; CHECK-NEXT:    vfmin.vv v24, v0, v24
 ; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vfmin.vv v8, v8, v24
 ; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -2711,59 +2688,56 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    addi a2, a0, 128
+; CHECK-NEXT:    addi a2, a0, 384
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a2)
-; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle32.v v16, (a1)
+; CHECK-NEXT:    vle32.v v16, (a2)
+; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vle32.v v24, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
 ; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle32.v v24, (a1)
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v16, v8
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vle32.v v24, (a1)
+; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v16, v16, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v24, v24, v8
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB149_2
@@ -2791,17 +2765,17 @@ define float @vreduce_fmaximum_v128f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v128f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    addi a2, a0, 384
+; CHECK-NEXT:    addi a2, a0, 256
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a2)
-; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    addi a1, a0, 384
 ; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    vle32.v v0, (a1)
-; CHECK-NEXT:    vfmax.vv v8, v24, v8
-; CHECK-NEXT:    vfmax.vv v16, v16, v0
+; CHECK-NEXT:    vle32.v v24, (a1)
+; CHECK-NEXT:    vle32.v v0, (a0)
+; CHECK-NEXT:    vfmax.vv v24, v0, v24
 ; CHECK-NEXT:    vfmax.vv v8, v16, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v24
 ; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -2959,42 +2933,25 @@ declare double @llvm.vector.reduce.fmaximum.v32f64(<32 x double>)
 define double @vreduce_fmaximum_v32f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT:    vfmax.vv v8, v8, v24
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB159_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    lui a0, %hi(.LCPI159_0)
 ; CHECK-NEXT:    fld fa0, %lo(.LCPI159_0)(a0)
-; CHECK-NEXT:    j .LBB159_3
+; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB159_2:
 ; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
-; CHECK-NEXT:  .LBB159_3:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = load <32 x double>, ptr %x
   %red = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> %v)
@@ -3028,59 +2985,56 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    addi a1, a0, 384
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
 ; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vle64.v v24, (a0)
+; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v16, v8
+; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v16, v16, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vmfeq.vv v7, v8, v8
+; CHECK-NEXT:    vmerge.vvm v16, v24, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
+; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v24, v24, v8
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB161_2
@@ -3109,15 +3063,15 @@ define double @vreduce_fmaximum_v64f64_nonans(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    addi a1, a0, 384
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    vle64.v v0, (a1)
-; CHECK-NEXT:    vfmax.vv v16, v24, v16
-; CHECK-NEXT:    vfmax.vv v8, v8, v0
+; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vle64.v v0, (a0)
+; CHECK-NEXT:    vfmax.vv v24, v0, v24
 ; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vfmax.vv v8, v8, v24
 ; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
index f920e39e7d295..8f61f314cf71b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
@@ -801,9 +801,9 @@ declare i32 @llvm.vp.reduce.xor.v64i32(i32, <64 x i32>, <64 x i1>, i32)
 define signext i32 @vpreduce_xor_v64i32(i32 signext %s, <64 x i32> %v, <64 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpreduce_xor_v64i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 4
+; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:    bltu a1, a3, .LBB49_2
 ; CHECK-NEXT:  # %bb.1:
@@ -1575,10 +1575,10 @@ define signext i8 @vpreduce_mul_v8i8(i8 signext %s, <8 x i8> %v, <8 x i1> %m, i3
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vid.v v10
 ; RV32-NEXT:    vmsltu.vx v9, v10, a1
-; RV32-NEXT:    vmand.mm v0, v9, v0
 ; RV32-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32-NEXT:    vmv.v.i v9, 1
-; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV32-NEXT:    vmv.v.i v10, 1
+; RV32-NEXT:    vmand.mm v0, v9, v0
+; RV32-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; RV32-NEXT:    vslidedown.vi v9, v8, 4
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    vslidedown.vi v9, v8, 2
@@ -1606,10 +1606,10 @@ define signext i8 @vpreduce_mul_v8i8(i8 signext %s, <8 x i8> %v, <8 x i1> %m, i3
 ; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV64-NEXT:    vid.v v10
 ; RV64-NEXT:    vmsltu.vx v9, v10, a1
-; RV64-NEXT:    vmand.mm v0, v9, v0
 ; RV64-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vmv.v.i v9, 1
-; RV64-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV64-NEXT:    vmv.v.i v10, 1
+; RV64-NEXT:    vmand.mm v0, v9, v0
+; RV64-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; RV64-NEXT:    vslidedown.vi v9, v8, 4
 ; RV64-NEXT:    vmul.vv v8, v8, v9
 ; RV64-NEXT:    vslidedown.vi v9, v8, 2
@@ -1643,10 +1643,10 @@ define signext i8 @vpreduce_mul_v16i8(i8 signext %s, <16 x i8> %v, <16 x i1> %m,
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vid.v v12
 ; RV32-NEXT:    vmsltu.vx v9, v12, a1
-; RV32-NEXT:    vmand.mm v0, v9, v0
 ; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v9, 1
-; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV32-NEXT:    vmv.v.i v10, 1
+; RV32-NEXT:    vmand.mm v0, v9, v0
+; RV32-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; RV32-NEXT:    vslidedown.vi v9, v8, 8
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    vslidedown.vi v9, v8, 4
@@ -1676,10 +1676,10 @@ define signext i8 @vpreduce_mul_v16i8(i8 signext %s, <16 x i8> %v, <16 x i1> %m,
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vid.v v12
 ; RV64-NEXT:    vmsltu.vx v9, v12, a1
-; RV64-NEXT:    vmand.mm v0, v9, v0
 ; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV64-NEXT:    vmv.v.i v9, 1
-; RV64-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV64-NEXT:    vmv.v.i v10, 1
+; RV64-NEXT:    vmand.mm v0, v9, v0
+; RV64-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; RV64-NEXT:    vslidedown.vi v9, v8, 8
 ; RV64-NEXT:    vmul.vv v8, v8, v9
 ; RV64-NEXT:    vslidedown.vi v9, v8, 4
@@ -1716,10 +1716,10 @@ define signext i8 @vpreduce_mul_v32i8(i8 signext %s, <32 x i8> %v, <32 x i1> %m,
 ; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; RV32-NEXT:    vid.v v16
 ; RV32-NEXT:    vmsltu.vx v10, v16, a1
-; RV32-NEXT:    vmand.mm v0, v10, v0
 ; RV32-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v10, 1
-; RV32-NEXT:    vmerge.vvm v8, v10, v8, v0
+; RV32-NEXT:    vmv.v.i v12, 1
+; RV32-NEXT:    vmand.mm v0, v10, v0
+; RV32-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; RV32-NEXT:    vslidedown.vi v10, v8, 16
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 8
@@ -1752,10 +1752,10 @@ define signext i8 @vpreduce_mul_v32i8(i8 signext %s, <32 x i8> %v, <32 x i1> %m,
 ; RV64-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; RV64-NEXT:    vid.v v16
 ; RV64-NEXT:    vmsltu.vx v10, v16, a1
-; RV64-NEXT:    vmand.mm v0, v10, v0
 ; RV64-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; RV64-NEXT:    vmv.v.i v10, 1
-; RV64-NEXT:    vmerge.vvm v8, v10, v8, v0
+; RV64-NEXT:    vmv.v.i v12, 1
+; RV64-NEXT:    vmand.mm v0, v10, v0
+; RV64-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; RV64-NEXT:    vslidedown.vi v10, v8, 16
 ; RV64-NEXT:    vmul.vv v8, v8, v10
 ; RV64-NEXT:    vslidedown.vi v10, v8, 8
@@ -1794,18 +1794,19 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m,
 ; RV32-NEXT:    lui a3, %hi(.LCPI72_0)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI72_0)
 ; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; RV32-NEXT:    vle8.v v12, (a3)
 ; RV32-NEXT:    vid.v v16
-; RV32-NEXT:    vmsltu.vx v14, v16, a1
-; RV32-NEXT:    li a3, 64
-; RV32-NEXT:    vsext.vf4 v16, v12
 ; RV32-NEXT:    vmsltu.vx v12, v16, a1
+; RV32-NEXT:    vle8.v v14, (a3)
+; RV32-NEXT:    li a3, 64
+; RV32-NEXT:    vsext.vf4 v16, v14
+; RV32-NEXT:    vmsltu.vx v13, v16, a1
+; RV32-NEXT:    vsetvli zero, a3, e8, m4, ta, ma
+; RV32-NEXT:    vmv.v.i v16, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT:    vslideup.vi v14, v12, 4
+; RV32-NEXT:    vslideup.vi v12, v13, 4
 ; RV32-NEXT:    vsetvli zero, a3, e8, m4, ta, ma
-; RV32-NEXT:    vmand.mm v0, v14, v0
-; RV32-NEXT:    vmv.v.i v12, 1
-; RV32-NEXT:    vmerge.vvm v8, v12, v8, v0
+; RV32-NEXT:    vmand.mm v0, v12, v0
+; RV32-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; RV32-NEXT:    vslidedown.vx v12, v8, a0
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 16
@@ -1840,18 +1841,19 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m,
 ; RV64-NEXT:    lui a3, %hi(.LCPI72_0)
 ; RV64-NEXT:    addi a3, a3, %lo(.LCPI72_0)
 ; RV64-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; RV64-NEXT:    vle8.v v12, (a3)
 ; RV64-NEXT:    vid.v v16
-; RV64-NEXT:    vmsltu.vx v14, v16, a1
-; RV64-NEXT:    li a3, 64
-; RV64-NEXT:    vsext.vf4 v16, v12
 ; RV64-NEXT:    vmsltu.vx v12, v16, a1
+; RV64-NEXT:    vle8.v v14, (a3)
+; RV64-NEXT:    li a3, 64
+; RV64-NEXT:    vsext.vf4 v16, v14
+; RV64-NEXT:    vmsltu.vx v13, v16, a1
+; RV64-NEXT:    vsetvli zero, a3, e8, m4, ta, ma
+; RV64-NEXT:    vmv.v.i v16, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT:    vslideup.vi v14, v12, 4
+; RV64-NEXT:    vslideup.vi v12, v13, 4
 ; RV64-NEXT:    vsetvli zero, a3, e8, m4, ta, ma
-; RV64-NEXT:    vmand.mm v0, v14, v0
-; RV64-NEXT:    vmv.v.i v12, 1
-; RV64-NEXT:    vmerge.vvm v8, v12, v8, v0
+; RV64-NEXT:    vmand.mm v0, v12, v0
+; RV64-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; RV64-NEXT:    vslidedown.vx v12, v8, a0
 ; RV64-NEXT:    vmul.vv v8, v8, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
index 707d1202aca0f..c3c657c96c92a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
@@ -1471,14 +1471,14 @@ declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>)
 define i64 @vreduce_add_v64i64(ptr %x) nounwind {
 ; RV32-LABEL: vreduce_add_v64i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi a1, a0, 384
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vle64.v v24, (a1)
-; RV32-NEXT:    addi a1, a0, 128
-; RV32-NEXT:    vle64.v v0, (a1)
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    addi a0, a0, 256
-; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    addi a1, a0, 256
+; RV32-NEXT:    vle64.v v16, (a1)
+; RV32-NEXT:    addi a1, a0, 384
+; RV32-NEXT:    addi a0, a0, 128
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    vle64.v v0, (a0)
 ; RV32-NEXT:    vadd.vv v24, v0, v24
 ; RV32-NEXT:    vmv.s.x v7, zero
 ; RV32-NEXT:    li a1, 32
@@ -1495,15 +1495,15 @@ define i64 @vreduce_add_v64i64(ptr %x) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    addi a1, a0, 384
-; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
+; RV64-NEXT:    vle64.v v16, (a1)
+; RV64-NEXT:    addi a1, a0, 384
 ; RV64-NEXT:    addi a0, a0, 128
-; RV64-NEXT:    vle64.v v24, (a0)
-; RV64-NEXT:    vle64.v v0, (a1)
-; RV64-NEXT:    vadd.vv v16, v24, v16
-; RV64-NEXT:    vadd.vv v8, v8, v0
+; RV64-NEXT:    vle64.v v24, (a1)
+; RV64-NEXT:    vle64.v v0, (a0)
+; RV64-NEXT:    vadd.vv v24, v0, v24
 ; RV64-NEXT:    vadd.vv v8, v8, v16
+; RV64-NEXT:    vadd.vv v8, v8, v24
 ; RV64-NEXT:    vmv.s.x v16, zero
 ; RV64-NEXT:    vredsum.vs v8, v8, v16
 ; RV64-NEXT:    vmv.x.s a0, v8
@@ -1519,18 +1519,18 @@ define i64 @vwreduce_add_v64i64(ptr %x) {
 ; RV32-NEXT:    addi a1, a0, 128
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vle32.v v16, (a0)
-; RV32-NEXT:    vle32.v v8, (a1)
+; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vle32.v v24, (a0)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v16, 16
+; RV32-NEXT:    vslidedown.vi v0, v24, 16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vwadd.vv v24, v16, v8
+; RV32-NEXT:    vwadd.vv v8, v24, v16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 16
+; RV32-NEXT:    vslidedown.vi v16, v16, 16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vwadd.vv v16, v0, v8
+; RV32-NEXT:    vwadd.vv v24, v0, v16
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v16
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vmv.s.x v16, zero
 ; RV32-NEXT:    vredsum.vs v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
@@ -1550,15 +1550,15 @@ define i64 @vwreduce_add_v64i64(ptr %x) {
 ; RV64-NEXT:    addi a1, a0, 128
 ; RV64-NEXT:    li a2, 32
 ; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    vle32.v v16, (a1)
+; RV64-NEXT:    vle32.v v8, (a1)
+; RV64-NEXT:    vle32.v v16, (a0)
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT:    vslidedown.vi v24, v8, 16
+; RV64-NEXT:    vslidedown.vi v24, v16, 16
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV64-NEXT:    vslidedown.vi v0, v16, 16
+; RV64-NEXT:    vslidedown.vi v0, v8, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vwadd.vv v24, v8, v16
+; RV64-NEXT:    vwadd.vv v24, v16, v8
 ; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vwadd.vv v8, v16, v0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1585,18 +1585,18 @@ define i64 @vwreduce_uadd_v64i64(ptr %x) {
 ; RV32-NEXT:    addi a1, a0, 128
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vle32.v v16, (a0)
-; RV32-NEXT:    vle32.v v8, (a1)
+; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vle32.v v24, (a0)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v16, 16
+; RV32-NEXT:    vslidedown.vi v0, v24, 16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vwaddu.vv v24, v16, v8
+; RV32-NEXT:    vwaddu.vv v8, v24, v16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 16
+; RV32-NEXT:    vslidedown.vi v16, v16, 16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vwaddu.vv v16, v0, v8
+; RV32-NEXT:    vwaddu.vv v24, v0, v16
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v16
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vmv.s.x v16, zero
 ; RV32-NEXT:    vredsum.vs v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
@@ -1616,15 +1616,15 @@ define i64 @vwreduce_uadd_v64i64(ptr %x) {
 ; RV64-NEXT:    addi a1, a0, 128
 ; RV64-NEXT:    li a2, 32
 ; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    vle32.v v16, (a1)
+; RV64-NEXT:    vle32.v v8, (a1)
+; RV64-NEXT:    vle32.v v16, (a0)
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT:    vslidedown.vi v24, v8, 16
+; RV64-NEXT:    vslidedown.vi v24, v16, 16
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV64-NEXT:    vslidedown.vi v0, v16, 16
+; RV64-NEXT:    vslidedown.vi v0, v8, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vwaddu.vv v24, v8, v16
+; RV64-NEXT:    vwaddu.vv v24, v16, v8
 ; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vwaddu.vv v8, v16, v0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -2201,16 +2201,16 @@ define i64 @vreduce_and_v64i64(ptr %x) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    addi a1, a0, 384
-; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
+; RV32-NEXT:    vle64.v v16, (a1)
+; RV32-NEXT:    addi a1, a0, 384
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v0, (a0)
 ; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    vle64.v v0, (a0)
 ; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v24, v0, v24
 ; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vredand.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -2222,15 +2222,15 @@ define i64 @vreduce_and_v64i64(ptr %x) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    addi a1, a0, 384
-; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
+; RV64-NEXT:    vle64.v v16, (a1)
+; RV64-NEXT:    addi a1, a0, 384
 ; RV64-NEXT:    addi a0, a0, 128
-; RV64-NEXT:    vle64.v v24, (a0)
-; RV64-NEXT:    vle64.v v0, (a1)
-; RV64-NEXT:    vand.vv v16, v24, v16
-; RV64-NEXT:    vand.vv v8, v8, v0
+; RV64-NEXT:    vle64.v v24, (a1)
+; RV64-NEXT:    vle64.v v0, (a0)
+; RV64-NEXT:    vand.vv v24, v0, v24
 ; RV64-NEXT:    vand.vv v8, v8, v16
+; RV64-NEXT:    vand.vv v8, v8, v24
 ; RV64-NEXT:    vredand.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    ret
@@ -2793,16 +2793,16 @@ define i64 @vreduce_or_v64i64(ptr %x) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    addi a1, a0, 384
-; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
+; RV32-NEXT:    vle64.v v16, (a1)
+; RV32-NEXT:    addi a1, a0, 384
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v0, (a0)
 ; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    vle64.v v0, (a0)
 ; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vor.vv v24, v0, v24
 ; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vredor.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -2814,15 +2814,15 @@ define i64 @vreduce_or_v64i64(ptr %x) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    addi a1, a0, 384
-; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
+; RV64-NEXT:    vle64.v v16, (a1)
+; RV64-NEXT:    addi a1, a0, 384
 ; RV64-NEXT:    addi a0, a0, 128
-; RV64-NEXT:    vle64.v v24, (a0)
-; RV64-NEXT:    vle64.v v0, (a1)
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vor.vv v8, v8, v0
+; RV64-NEXT:    vle64.v v24, (a1)
+; RV64-NEXT:    vle64.v v0, (a0)
+; RV64-NEXT:    vor.vv v24, v0, v24
 ; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vredor.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    ret
@@ -3414,14 +3414,14 @@ declare i64 @llvm.vector.reduce.xor.v64i64(<64 x i64>)
 define i64 @vreduce_xor_v64i64(ptr %x) nounwind {
 ; RV32-LABEL: vreduce_xor_v64i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi a1, a0, 384
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vle64.v v24, (a1)
-; RV32-NEXT:    addi a1, a0, 128
-; RV32-NEXT:    vle64.v v0, (a1)
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    addi a0, a0, 256
-; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    addi a1, a0, 256
+; RV32-NEXT:    vle64.v v16, (a1)
+; RV32-NEXT:    addi a1, a0, 384
+; RV32-NEXT:    addi a0, a0, 128
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    vle64.v v0, (a0)
 ; RV32-NEXT:    vxor.vv v24, v0, v24
 ; RV32-NEXT:    vmv.s.x v7, zero
 ; RV32-NEXT:    li a1, 32
@@ -3438,15 +3438,15 @@ define i64 @vreduce_xor_v64i64(ptr %x) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    addi a1, a0, 384
-; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
+; RV64-NEXT:    vle64.v v16, (a1)
+; RV64-NEXT:    addi a1, a0, 384
 ; RV64-NEXT:    addi a0, a0, 128
-; RV64-NEXT:    vle64.v v24, (a0)
-; RV64-NEXT:    vle64.v v0, (a1)
-; RV64-NEXT:    vxor.vv v16, v24, v16
-; RV64-NEXT:    vxor.vv v8, v8, v0
+; RV64-NEXT:    vle64.v v24, (a1)
+; RV64-NEXT:    vle64.v v0, (a0)
+; RV64-NEXT:    vxor.vv v24, v0, v24
 ; RV64-NEXT:    vxor.vv v8, v8, v16
+; RV64-NEXT:    vxor.vv v8, v8, v24
 ; RV64-NEXT:    vmv.s.x v16, zero
 ; RV64-NEXT:    vredxor.vs v8, v8, v16
 ; RV64-NEXT:    vmv.x.s a0, v8
@@ -4011,16 +4011,16 @@ define i64 @vreduce_smin_v64i64(ptr %x) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    addi a1, a0, 384
-; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
+; RV32-NEXT:    vle64.v v16, (a1)
+; RV32-NEXT:    addi a1, a0, 384
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v0, (a0)
 ; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    vle64.v v0, (a0)
 ; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vmin.vv v16, v0, v16
-; RV32-NEXT:    vmin.vv v8, v8, v24
+; RV32-NEXT:    vmin.vv v24, v0, v24
 ; RV32-NEXT:    vmin.vv v8, v8, v16
+; RV32-NEXT:    vmin.vv v8, v8, v24
 ; RV32-NEXT:    vredmin.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -4032,15 +4032,15 @@ define i64 @vreduce_smin_v64i64(ptr %x) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    addi a1, a0, 384
-; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
+; RV64-NEXT:    vle64.v v16, (a1)
+; RV64-NEXT:    addi a1, a0, 384
 ; RV64-NEXT:    addi a0, a0, 128
-; RV64-NEXT:    vle64.v v24, (a0)
-; RV64-NEXT:    vle64.v v0, (a1)
-; RV64-NEXT:    vmin.vv v16, v24, v16
-; RV64-NEXT:    vmin.vv v8, v8, v0
+; RV64-NEXT:    vle64.v v24, (a1)
+; RV64-NEXT:    vle64.v v0, (a0)
+; RV64-NEXT:    vmin.vv v24, v0, v24
 ; RV64-NEXT:    vmin.vv v8, v8, v16
+; RV64-NEXT:    vmin.vv v8, v8, v24
 ; RV64-NEXT:    vredmin.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    ret
@@ -4604,16 +4604,16 @@ define i64 @vreduce_smax_v64i64(ptr %x) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    addi a1, a0, 384
-; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
+; RV32-NEXT:    vle64.v v16, (a1)
+; RV32-NEXT:    addi a1, a0, 384
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v0, (a0)
 ; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    vle64.v v0, (a0)
 ; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vmax.vv v16, v0, v16
-; RV32-NEXT:    vmax.vv v8, v8, v24
+; RV32-NEXT:    vmax.vv v24, v0, v24
 ; RV32-NEXT:    vmax.vv v8, v8, v16
+; RV32-NEXT:    vmax.vv v8, v8, v24
 ; RV32-NEXT:    vredmax.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -4625,15 +4625,15 @@ define i64 @vreduce_smax_v64i64(ptr %x) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    addi a1, a0, 384
-; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
+; RV64-NEXT:    vle64.v v16, (a1)
+; RV64-NEXT:    addi a1, a0, 384
 ; RV64-NEXT:    addi a0, a0, 128
-; RV64-NEXT:    vle64.v v24, (a0)
-; RV64-NEXT:    vle64.v v0, (a1)
-; RV64-NEXT:    vmax.vv v16, v24, v16
-; RV64-NEXT:    vmax.vv v8, v8, v0
+; RV64-NEXT:    vle64.v v24, (a1)
+; RV64-NEXT:    vle64.v v0, (a0)
+; RV64-NEXT:    vmax.vv v24, v0, v24
 ; RV64-NEXT:    vmax.vv v8, v8, v16
+; RV64-NEXT:    vmax.vv v8, v8, v24
 ; RV64-NEXT:    vredmax.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    ret
@@ -5197,16 +5197,16 @@ define i64 @vreduce_umin_v64i64(ptr %x) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    addi a1, a0, 384
-; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
+; RV32-NEXT:    vle64.v v16, (a1)
+; RV32-NEXT:    addi a1, a0, 384
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v0, (a0)
 ; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    vle64.v v0, (a0)
 ; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vminu.vv v16, v0, v16
-; RV32-NEXT:    vminu.vv v8, v8, v24
+; RV32-NEXT:    vminu.vv v24, v0, v24
 ; RV32-NEXT:    vminu.vv v8, v8, v16
+; RV32-NEXT:    vminu.vv v8, v8, v24
 ; RV32-NEXT:    vredminu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -5218,15 +5218,15 @@ define i64 @vreduce_umin_v64i64(ptr %x) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    addi a1, a0, 384
-; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
+; RV64-NEXT:    vle64.v v16, (a1)
+; RV64-NEXT:    addi a1, a0, 384
 ; RV64-NEXT:    addi a0, a0, 128
-; RV64-NEXT:    vle64.v v24, (a0)
-; RV64-NEXT:    vle64.v v0, (a1)
-; RV64-NEXT:    vminu.vv v16, v24, v16
-; RV64-NEXT:    vminu.vv v8, v8, v0
+; RV64-NEXT:    vle64.v v24, (a1)
+; RV64-NEXT:    vle64.v v0, (a0)
+; RV64-NEXT:    vminu.vv v24, v0, v24
 ; RV64-NEXT:    vminu.vv v8, v8, v16
+; RV64-NEXT:    vminu.vv v8, v8, v24
 ; RV64-NEXT:    vredminu.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    ret
@@ -5789,16 +5789,16 @@ define i64 @vreduce_umax_v64i64(ptr %x) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    addi a1, a0, 384
-; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
+; RV32-NEXT:    vle64.v v16, (a1)
+; RV32-NEXT:    addi a1, a0, 384
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v0, (a0)
 ; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    vle64.v v0, (a0)
 ; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vmaxu.vv v16, v0, v16
-; RV32-NEXT:    vmaxu.vv v8, v8, v24
+; RV32-NEXT:    vmaxu.vv v24, v0, v24
 ; RV32-NEXT:    vmaxu.vv v8, v8, v16
+; RV32-NEXT:    vmaxu.vv v8, v8, v24
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -5810,15 +5810,15 @@ define i64 @vreduce_umax_v64i64(ptr %x) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    addi a1, a0, 384
-; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
+; RV64-NEXT:    vle64.v v16, (a1)
+; RV64-NEXT:    addi a1, a0, 384
 ; RV64-NEXT:    addi a0, a0, 128
-; RV64-NEXT:    vle64.v v24, (a0)
-; RV64-NEXT:    vle64.v v0, (a1)
-; RV64-NEXT:    vmaxu.vv v16, v24, v16
-; RV64-NEXT:    vmaxu.vv v8, v8, v0
+; RV64-NEXT:    vle64.v v24, (a1)
+; RV64-NEXT:    vle64.v v0, (a0)
+; RV64-NEXT:    vmaxu.vv v24, v0, v24
 ; RV64-NEXT:    vmaxu.vv v8, v8, v16
+; RV64-NEXT:    vmaxu.vv v8, v8, v24
 ; RV64-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    ret
@@ -6585,15 +6585,15 @@ define i64 @vreduce_mul_v64i64(ptr %x) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    addi a1, a0, 384
-; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
+; RV32-NEXT:    vle64.v v16, (a1)
+; RV32-NEXT:    addi a1, a0, 384
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
-; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vmul.vv v16, v24, v16
-; RV32-NEXT:    vmul.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    vle64.v v0, (a0)
+; RV32-NEXT:    vmul.vv v24, v0, v24
 ; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    vslidedown.vi v16, v8, 8
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 4
@@ -6612,15 +6612,15 @@ define i64 @vreduce_mul_v64i64(ptr %x) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    addi a1, a0, 384
-; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
+; RV64-NEXT:    vle64.v v16, (a1)
+; RV64-NEXT:    addi a1, a0, 384
 ; RV64-NEXT:    addi a0, a0, 128
-; RV64-NEXT:    vle64.v v24, (a0)
-; RV64-NEXT:    vle64.v v0, (a1)
-; RV64-NEXT:    vmul.vv v16, v24, v16
-; RV64-NEXT:    vmul.vv v8, v8, v0
+; RV64-NEXT:    vle64.v v24, (a1)
+; RV64-NEXT:    vle64.v v0, (a0)
+; RV64-NEXT:    vmul.vv v24, v0, v24
 ; RV64-NEXT:    vmul.vv v8, v8, v16
+; RV64-NEXT:    vmul.vv v8, v8, v24
 ; RV64-NEXT:    vslidedown.vi v16, v8, 8
 ; RV64-NEXT:    vmul.vv v8, v8, v16
 ; RV64-NEXT:    vslidedown.vi v16, v8, 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
index 266772d36ee9c..70555bd6c09e2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
@@ -519,8 +519,8 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v6, v0
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
@@ -542,11 +542,11 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; CHECK-NEXT:    addi a1, a0, -16
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
index a4ff079846fd8..d35637401dd66 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
@@ -13,13 +13,13 @@ declare <2 x half> @llvm.vp.round.v2f16(<2 x half>, <2 x i1>, i32)
 define <2 x half> @vp_round_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_v2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI0_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI0_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI0_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -35,12 +35,12 @@ define <2 x half> @vp_round_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -59,12 +59,12 @@ define <2 x half> @vp_round_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 define <2 x half> @vp_round_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_v2f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI1_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI1_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI1_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -77,11 +77,11 @@ define <2 x half> @vp_round_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -99,13 +99,13 @@ declare <4 x half> @llvm.vp.round.v4f16(<4 x half>, <4 x i1>, i32)
 define <4 x half> @vp_round_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_v4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI2_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI2_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI2_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -121,12 +121,12 @@ define <4 x half> @vp_round_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -145,12 +145,12 @@ define <4 x half> @vp_round_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 define <4 x half> @vp_round_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_v4f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI3_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI3_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI3_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -163,11 +163,11 @@ define <4 x half> @vp_round_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -185,13 +185,13 @@ declare <8 x half> @llvm.vp.round.v8f16(<8 x half>, <8 x i1>, i32)
 define <8 x half> @vp_round_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_v8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI4_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI4_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI4_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -207,12 +207,12 @@ define <8 x half> @vp_round_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
@@ -231,12 +231,12 @@ define <8 x half> @vp_round_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 define <8 x half> @vp_round_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_v8f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI5_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI5_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -249,11 +249,11 @@ define <8 x half> @vp_round_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -273,12 +273,12 @@ define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v10, v0
+; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a0)
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -295,12 +295,12 @@ define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
@@ -319,12 +319,12 @@ define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %
 define <16 x half> @vp_round_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_v16f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI7_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -337,11 +337,11 @@ define <16 x half> @vp_round_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -363,9 +363,9 @@ define <2 x float> @vp_round_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -384,8 +384,8 @@ define <2 x float> @vp_round_v2f32_unmasked(<2 x float> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -405,9 +405,9 @@ define <4 x float> @vp_round_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -426,8 +426,8 @@ define <4 x float> @vp_round_v4f32_unmasked(<4 x float> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -448,9 +448,9 @@ define <8 x float> @vp_round_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -470,8 +470,8 @@ define <8 x float> @vp_round_v8f32_unmasked(<8 x float> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -492,9 +492,9 @@ define <16 x float> @vp_round_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -514,8 +514,8 @@ define <16 x float> @vp_round_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -531,13 +531,13 @@ declare <2 x double> @llvm.vp.round.v2f64(<2 x double>, <2 x i1>, i32)
 define <2 x double> @vp_round_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI16_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI16_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
+; CHECK-NEXT:    lui a0, %hi(.LCPI16_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI16_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -552,12 +552,12 @@ define <2 x double> @vp_round_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %
 define <2 x double> @vp_round_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v2f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI17_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -575,12 +575,12 @@ define <4 x double> @vp_round_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -596,12 +596,12 @@ define <4 x double> @vp_round_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %
 define <4 x double> @vp_round_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v4f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI19_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -619,12 +619,12 @@ define <8 x double> @vp_round_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a0)
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -640,12 +640,12 @@ define <8 x double> @vp_round_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %
 define <8 x double> @vp_round_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v8f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI21_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI21_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI21_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI21_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -663,12 +663,12 @@ define <15 x double> @vp_round_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -684,12 +684,12 @@ define <15 x double> @vp_round_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroe
 define <15 x double> @vp_round_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v15f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI23_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI23_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI23_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI23_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -707,12 +707,12 @@ define <16 x double> @vp_round_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -728,12 +728,12 @@ define <16 x double> @vp_round_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroe
 define <16 x double> @vp_round_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v16f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI25_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI25_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI25_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI25_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -762,8 +762,8 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
@@ -778,33 +778,33 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
index c28d5fb1a8193..addb76b0bea7a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
@@ -13,13 +13,13 @@ declare <2 x half> @llvm.vp.roundeven.v2f16(<2 x half>, <2 x i1>, i32)
 define <2 x half> @vp_roundeven_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_v2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI0_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI0_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI0_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -35,12 +35,12 @@ define <2 x half> @vp_roundeven_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -59,12 +59,12 @@ define <2 x half> @vp_roundeven_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %
 define <2 x half> @vp_roundeven_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_v2f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI1_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI1_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI1_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -77,11 +77,11 @@ define <2 x half> @vp_roundeven_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -99,13 +99,13 @@ declare <4 x half> @llvm.vp.roundeven.v4f16(<4 x half>, <4 x i1>, i32)
 define <4 x half> @vp_roundeven_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_v4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI2_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI2_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI2_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -121,12 +121,12 @@ define <4 x half> @vp_roundeven_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -145,12 +145,12 @@ define <4 x half> @vp_roundeven_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %
 define <4 x half> @vp_roundeven_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_v4f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI3_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI3_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI3_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -163,11 +163,11 @@ define <4 x half> @vp_roundeven_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -185,13 +185,13 @@ declare <8 x half> @llvm.vp.roundeven.v8f16(<8 x half>, <8 x i1>, i32)
 define <8 x half> @vp_roundeven_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_v8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI4_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI4_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI4_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -207,12 +207,12 @@ define <8 x half> @vp_roundeven_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
@@ -231,12 +231,12 @@ define <8 x half> @vp_roundeven_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %
 define <8 x half> @vp_roundeven_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_v8f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI5_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI5_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -249,11 +249,11 @@ define <8 x half> @vp_roundeven_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -273,12 +273,12 @@ define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v10, v0
+; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a0)
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -295,12 +295,12 @@ define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
@@ -319,12 +319,12 @@ define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe
 define <16 x half> @vp_roundeven_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_v16f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI7_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -337,11 +337,11 @@ define <16 x half> @vp_roundeven_v16f16_unmasked(<16 x half> %va, i32 zeroext %e
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -363,9 +363,9 @@ define <2 x float> @vp_roundeven_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -384,8 +384,8 @@ define <2 x float> @vp_roundeven_v2f32_unmasked(<2 x float> %va, i32 zeroext %ev
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -405,9 +405,9 @@ define <4 x float> @vp_roundeven_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -426,8 +426,8 @@ define <4 x float> @vp_roundeven_v4f32_unmasked(<4 x float> %va, i32 zeroext %ev
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -448,9 +448,9 @@ define <8 x float> @vp_roundeven_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -470,8 +470,8 @@ define <8 x float> @vp_roundeven_v8f32_unmasked(<8 x float> %va, i32 zeroext %ev
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -492,9 +492,9 @@ define <16 x float> @vp_roundeven_v16f32(<16 x float> %va, <16 x i1> %m, i32 zer
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -514,8 +514,8 @@ define <16 x float> @vp_roundeven_v16f32_unmasked(<16 x float> %va, i32 zeroext
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -531,13 +531,13 @@ declare <2 x double> @llvm.vp.roundeven.v2f64(<2 x double>, <2 x i1>, i32)
 define <2 x double> @vp_roundeven_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI16_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI16_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
+; CHECK-NEXT:    lui a0, %hi(.LCPI16_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI16_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -552,12 +552,12 @@ define <2 x double> @vp_roundeven_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroe
 define <2 x double> @vp_roundeven_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v2f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI17_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -575,12 +575,12 @@ define <4 x double> @vp_roundeven_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -596,12 +596,12 @@ define <4 x double> @vp_roundeven_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroe
 define <4 x double> @vp_roundeven_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v4f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI19_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -619,12 +619,12 @@ define <8 x double> @vp_roundeven_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a0)
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -640,12 +640,12 @@ define <8 x double> @vp_roundeven_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroe
 define <8 x double> @vp_roundeven_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v8f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI21_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI21_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI21_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI21_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -663,12 +663,12 @@ define <15 x double> @vp_roundeven_v15f64(<15 x double> %va, <15 x i1> %m, i32 z
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -684,12 +684,12 @@ define <15 x double> @vp_roundeven_v15f64(<15 x double> %va, <15 x i1> %m, i32 z
 define <15 x double> @vp_roundeven_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v15f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI23_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI23_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI23_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI23_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -707,12 +707,12 @@ define <16 x double> @vp_roundeven_v16f64(<16 x double> %va, <16 x i1> %m, i32 z
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -728,12 +728,12 @@ define <16 x double> @vp_roundeven_v16f64(<16 x double> %va, <16 x i1> %m, i32 z
 define <16 x double> @vp_roundeven_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v16f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI25_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI25_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI25_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI25_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -762,8 +762,8 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
@@ -778,33 +778,33 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
index 64d3664a4c372..bac25bcfec01d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
@@ -13,13 +13,13 @@ declare <2 x half> @llvm.vp.roundtozero.v2f16(<2 x half>, <2 x i1>, i32)
 define <2 x half> @vp_roundtozero_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_v2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI0_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI0_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI0_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -35,12 +35,12 @@ define <2 x half> @vp_roundtozero_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -59,12 +59,12 @@ define <2 x half> @vp_roundtozero_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext
 define <2 x half> @vp_roundtozero_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_v2f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI1_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI1_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI1_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -77,11 +77,11 @@ define <2 x half> @vp_roundtozero_v2f16_unmasked(<2 x half> %va, i32 zeroext %ev
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -99,13 +99,13 @@ declare <4 x half> @llvm.vp.roundtozero.v4f16(<4 x half>, <4 x i1>, i32)
 define <4 x half> @vp_roundtozero_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_v4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI2_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI2_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI2_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -121,12 +121,12 @@ define <4 x half> @vp_roundtozero_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -145,12 +145,12 @@ define <4 x half> @vp_roundtozero_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext
 define <4 x half> @vp_roundtozero_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_v4f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI3_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI3_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI3_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -163,11 +163,11 @@ define <4 x half> @vp_roundtozero_v4f16_unmasked(<4 x half> %va, i32 zeroext %ev
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -185,13 +185,13 @@ declare <8 x half> @llvm.vp.roundtozero.v8f16(<8 x half>, <8 x i1>, i32)
 define <8 x half> @vp_roundtozero_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_v8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI4_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI4_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI4_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -207,12 +207,12 @@ define <8 x half> @vp_roundtozero_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
@@ -231,12 +231,12 @@ define <8 x half> @vp_roundtozero_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext
 define <8 x half> @vp_roundtozero_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_v8f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI5_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI5_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -249,11 +249,11 @@ define <8 x half> @vp_roundtozero_v8f16_unmasked(<8 x half> %va, i32 zeroext %ev
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -273,12 +273,12 @@ define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zer
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v10, v0
+; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a0)
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -295,12 +295,12 @@ define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zer
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
@@ -319,12 +319,12 @@ define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zer
 define <16 x half> @vp_roundtozero_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_v16f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI7_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -337,11 +337,11 @@ define <16 x half> @vp_roundtozero_v16f16_unmasked(<16 x half> %va, i32 zeroext
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -363,9 +363,9 @@ define <2 x float> @vp_roundtozero_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -384,8 +384,8 @@ define <2 x float> @vp_roundtozero_v2f32_unmasked(<2 x float> %va, i32 zeroext %
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -405,9 +405,9 @@ define <4 x float> @vp_roundtozero_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -426,8 +426,8 @@ define <4 x float> @vp_roundtozero_v4f32_unmasked(<4 x float> %va, i32 zeroext %
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -448,9 +448,9 @@ define <8 x float> @vp_roundtozero_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -470,8 +470,8 @@ define <8 x float> @vp_roundtozero_v8f32_unmasked(<8 x float> %va, i32 zeroext %
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -492,9 +492,9 @@ define <16 x float> @vp_roundtozero_v16f32(<16 x float> %va, <16 x i1> %m, i32 z
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -514,8 +514,8 @@ define <16 x float> @vp_roundtozero_v16f32_unmasked(<16 x float> %va, i32 zeroex
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -531,13 +531,13 @@ declare <2 x double> @llvm.vp.roundtozero.v2f64(<2 x double>, <2 x i1>, i32)
 define <2 x double> @vp_roundtozero_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI16_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI16_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
+; CHECK-NEXT:    lui a0, %hi(.LCPI16_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI16_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -552,12 +552,12 @@ define <2 x double> @vp_roundtozero_v2f64(<2 x double> %va, <2 x i1> %m, i32 zer
 define <2 x double> @vp_roundtozero_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v2f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI17_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -575,12 +575,12 @@ define <4 x double> @vp_roundtozero_v4f64(<4 x double> %va, <4 x i1> %m, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -596,12 +596,12 @@ define <4 x double> @vp_roundtozero_v4f64(<4 x double> %va, <4 x i1> %m, i32 zer
 define <4 x double> @vp_roundtozero_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v4f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI19_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -619,12 +619,12 @@ define <8 x double> @vp_roundtozero_v8f64(<8 x double> %va, <8 x i1> %m, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a0)
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -640,12 +640,12 @@ define <8 x double> @vp_roundtozero_v8f64(<8 x double> %va, <8 x i1> %m, i32 zer
 define <8 x double> @vp_roundtozero_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v8f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI21_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI21_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI21_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI21_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -663,12 +663,12 @@ define <15 x double> @vp_roundtozero_v15f64(<15 x double> %va, <15 x i1> %m, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -684,12 +684,12 @@ define <15 x double> @vp_roundtozero_v15f64(<15 x double> %va, <15 x i1> %m, i32
 define <15 x double> @vp_roundtozero_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v15f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI23_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI23_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI23_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI23_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -707,12 +707,12 @@ define <16 x double> @vp_roundtozero_v16f64(<16 x double> %va, <16 x i1> %m, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -728,12 +728,12 @@ define <16 x double> @vp_roundtozero_v16f64(<16 x double> %va, <16 x i1> %m, i32
 define <16 x double> @vp_roundtozero_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v16f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI25_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI25_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI25_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI25_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -762,8 +762,8 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
@@ -778,33 +778,33 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll
index 318f38839851c..034a969fc2847 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll
@@ -141,36 +141,18 @@ define <32 x i32> @select_addsub_v32i32(<32 x i1> %cc, <32 x i32> %a, <32 x i32>
 define <64 x i32> @select_addsub_v64i32(<64 x i1> %cc, <64 x i32> %a, <64 x i32> %b) {
 ; CHECK-LABEL: select_addsub_v64i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vle32.v v24, (a0)
+; CHECK-NEXT:    vrsub.vi v24, v24, 0, v0.t
 ; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vadd.vv v8, v8, v24
 ; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    vrsub.vi v8, v8, 0, v0.t
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 4
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vrsub.vi v24, v24, 0, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vadd.vv v16, v16, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %sub = sub <64 x i32> %a, %b
   %add = add <64 x i32> %a, %b
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index 03d5762b4903e..13242fc8f0d66 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -1073,19 +1073,19 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFH-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFH-NEXT:    addi a1, a0, 128
 ; ZVFH-NEXT:    li a3, 64
+; ZVFH-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; ZVFH-NEXT:    vslidedown.vi v24, v0, 8
 ; ZVFH-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
 ; ZVFH-NEXT:    vle16.v v16, (a1)
 ; ZVFH-NEXT:    addi a1, sp, 16
 ; ZVFH-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFH-NEXT:    mv a1, a2
 ; ZVFH-NEXT:    vle16.v v16, (a0)
-; ZVFH-NEXT:    mv a0, a2
-; ZVFH-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; ZVFH-NEXT:    vslidedown.vi v24, v0, 8
 ; ZVFH-NEXT:    bltu a2, a3, .LBB43_2
 ; ZVFH-NEXT:  # %bb.1:
-; ZVFH-NEXT:    li a0, 64
+; ZVFH-NEXT:    li a1, 64
 ; ZVFH-NEXT:  .LBB43_2:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v7, v8, v16, v0.t
 ; ZVFH-NEXT:    addi a0, a2, -64
 ; ZVFH-NEXT:    sltu a1, a2, a0
@@ -1114,20 +1114,32 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ;
 ; ZVFHMIN32-LABEL: fcmp_oeq_vv_v128f16:
 ; ZVFHMIN32:       # %bb.0:
-; ZVFHMIN32-NEXT:    addi sp, sp, -896
-; ZVFHMIN32-NEXT:    .cfi_def_cfa_offset 896
-; ZVFHMIN32-NEXT:    sw ra, 892(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s0, 888(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s2, 884(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s3, 880(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s4, 876(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s5, 872(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s6, 868(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s7, 864(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s8, 860(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s9, 856(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s10, 852(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s11, 848(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    addi sp, sp, -1024
+; ZVFHMIN32-NEXT:    .cfi_def_cfa_offset 1024
+; ZVFHMIN32-NEXT:    sw ra, 1020(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s0, 1016(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s2, 1012(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s3, 1008(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s4, 1004(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s5, 1000(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s6, 996(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s7, 992(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s8, 988(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s9, 984(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s10, 980(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s11, 976(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    fsd fs0, 968(sp) # 8-byte Folded Spill
+; ZVFHMIN32-NEXT:    fsd fs1, 960(sp) # 8-byte Folded Spill
+; ZVFHMIN32-NEXT:    fsd fs2, 952(sp) # 8-byte Folded Spill
+; ZVFHMIN32-NEXT:    fsd fs3, 944(sp) # 8-byte Folded Spill
+; ZVFHMIN32-NEXT:    fsd fs4, 936(sp) # 8-byte Folded Spill
+; ZVFHMIN32-NEXT:    fsd fs5, 928(sp) # 8-byte Folded Spill
+; ZVFHMIN32-NEXT:    fsd fs6, 920(sp) # 8-byte Folded Spill
+; ZVFHMIN32-NEXT:    fsd fs7, 912(sp) # 8-byte Folded Spill
+; ZVFHMIN32-NEXT:    fsd fs8, 904(sp) # 8-byte Folded Spill
+; ZVFHMIN32-NEXT:    fsd fs9, 896(sp) # 8-byte Folded Spill
+; ZVFHMIN32-NEXT:    fsd fs10, 888(sp) # 8-byte Folded Spill
+; ZVFHMIN32-NEXT:    fsd fs11, 880(sp) # 8-byte Folded Spill
 ; ZVFHMIN32-NEXT:    .cfi_offset ra, -4
 ; ZVFHMIN32-NEXT:    .cfi_offset s0, -8
 ; ZVFHMIN32-NEXT:    .cfi_offset s2, -12
@@ -1140,1096 +1152,1175 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    .cfi_offset s9, -40
 ; ZVFHMIN32-NEXT:    .cfi_offset s10, -44
 ; ZVFHMIN32-NEXT:    .cfi_offset s11, -48
-; ZVFHMIN32-NEXT:    addi s0, sp, 896
+; ZVFHMIN32-NEXT:    .cfi_offset fs0, -56
+; ZVFHMIN32-NEXT:    .cfi_offset fs1, -64
+; ZVFHMIN32-NEXT:    .cfi_offset fs2, -72
+; ZVFHMIN32-NEXT:    .cfi_offset fs3, -80
+; ZVFHMIN32-NEXT:    .cfi_offset fs4, -88
+; ZVFHMIN32-NEXT:    .cfi_offset fs5, -96
+; ZVFHMIN32-NEXT:    .cfi_offset fs6, -104
+; ZVFHMIN32-NEXT:    .cfi_offset fs7, -112
+; ZVFHMIN32-NEXT:    .cfi_offset fs8, -120
+; ZVFHMIN32-NEXT:    .cfi_offset fs9, -128
+; ZVFHMIN32-NEXT:    .cfi_offset fs10, -136
+; ZVFHMIN32-NEXT:    .cfi_offset fs11, -144
+; ZVFHMIN32-NEXT:    addi s0, sp, 1024
 ; ZVFHMIN32-NEXT:    .cfi_def_cfa s0, 0
 ; ZVFHMIN32-NEXT:    csrr a1, vlenb
-; ZVFHMIN32-NEXT:    li a2, 30
+; ZVFHMIN32-NEXT:    li a2, 41
 ; ZVFHMIN32-NEXT:    mul a1, a1, a2
 ; ZVFHMIN32-NEXT:    sub sp, sp, a1
 ; ZVFHMIN32-NEXT:    andi sp, sp, -128
-; ZVFHMIN32-NEXT:    addi a1, a0, 128
-; ZVFHMIN32-NEXT:    li a2, 64
-; ZVFHMIN32-NEXT:    addi a3, sp, 640
-; ZVFHMIN32-NEXT:    addi a4, sp, 384
-; ZVFHMIN32-NEXT:    addi a5, sp, 512
-; ZVFHMIN32-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; ZVFHMIN32-NEXT:    addi a3, a0, 128
+; ZVFHMIN32-NEXT:    li a1, 64
+; ZVFHMIN32-NEXT:    addi a4, sp, 640
+; ZVFHMIN32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN32-NEXT:    vle16.v v24, (a3)
+; ZVFHMIN32-NEXT:    csrr a3, vlenb
+; ZVFHMIN32-NEXT:    slli a5, a3, 5
+; ZVFHMIN32-NEXT:    add a3, a5, a3
+; ZVFHMIN32-NEXT:    add a3, sp, a3
+; ZVFHMIN32-NEXT:    addi a3, a3, 880
+; ZVFHMIN32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vle16.v v0, (a0)
-; ZVFHMIN32-NEXT:    addi a0, sp, 256
-; ZVFHMIN32-NEXT:    vle16.v v24, (a1)
-; ZVFHMIN32-NEXT:    vse16.v v8, (a3)
-; ZVFHMIN32-NEXT:    vse16.v v0, (a4)
-; ZVFHMIN32-NEXT:    vse16.v v16, (a5)
-; ZVFHMIN32-NEXT:    vse16.v v24, (a0)
-; ZVFHMIN32-NEXT:    lh a0, 704(sp)
+; ZVFHMIN32-NEXT:    vse16.v v8, (a4)
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 7
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    slli a0, a0, 5
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 6
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    slli a3, a0, 5
+; ZVFHMIN32-NEXT:    sub a0, a3, a0
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 5
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a3, 30
+; ZVFHMIN32-NEXT:    mul a0, a0, a3
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 4
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a3, 29
+; ZVFHMIN32-NEXT:    mul a0, a0, a3
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 3
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a3, 28
+; ZVFHMIN32-NEXT:    mul a0, a0, a3
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 2
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a3, 27
+; ZVFHMIN32-NEXT:    mul a0, a0, a3
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 1
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a3, 26
+; ZVFHMIN32-NEXT:    mul a0, a0, a3
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 15
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a3, 24
+; ZVFHMIN32-NEXT:    mul a0, a0, a3
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 14
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a3, 22
+; ZVFHMIN32-NEXT:    mul a0, a0, a3
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 13
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a3, 20
+; ZVFHMIN32-NEXT:    mul a0, a0, a3
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 12
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a3, 18
+; ZVFHMIN32-NEXT:    mul a0, a0, a3
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 11
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    slli a3, a0, 3
+; ZVFHMIN32-NEXT:    add a0, a3, a0
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 10
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    slli a3, a0, 3
+; ZVFHMIN32-NEXT:    sub a0, a3, a0
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 9
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    slli a3, a0, 2
+; ZVFHMIN32-NEXT:    add a0, a3, a0
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v8, 8
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    slli a3, a0, 1
+; ZVFHMIN32-NEXT:    add a0, a3, a0
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    addi a0, sp, 384
+; ZVFHMIN32-NEXT:    addi a3, sp, 512
+; ZVFHMIN32-NEXT:    vmv.x.s a5, v16
+; ZVFHMIN32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN32-NEXT:    vse16.v v0, (a0)
+; ZVFHMIN32-NEXT:    vse16.v v16, (a3)
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 7
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a3, 11
+; ZVFHMIN32-NEXT:    mul a0, a0, a3
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 6
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a3, 12
+; ZVFHMIN32-NEXT:    mul a0, a0, a3
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 5
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a3, 13
+; ZVFHMIN32-NEXT:    mul a0, a0, a3
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 4
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a3, 14
+; ZVFHMIN32-NEXT:    mul a0, a0, a3
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 3
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    slli a3, a0, 4
+; ZVFHMIN32-NEXT:    sub a0, a3, a0
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 2
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    slli a0, a0, 4
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 1
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    slli a3, a0, 4
+; ZVFHMIN32-NEXT:    add a0, a3, a0
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 15
+; ZVFHMIN32-NEXT:    addi a0, sp, 880
+; ZVFHMIN32-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v4, v16, 14
+; ZVFHMIN32-NEXT:    vslidedown.vi v2, v16, 13
+; ZVFHMIN32-NEXT:    vslidedown.vi v24, v16, 12
+; ZVFHMIN32-NEXT:    vslidedown.vi v22, v16, 11
+; ZVFHMIN32-NEXT:    vslidedown.vi v20, v16, 10
+; ZVFHMIN32-NEXT:    vslidedown.vi v18, v16, 9
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v16, 8
+; ZVFHMIN32-NEXT:    vmv.x.s a6, v0
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v5, v0, 7
+; ZVFHMIN32-NEXT:    vslidedown.vi v17, v0, 6
+; ZVFHMIN32-NEXT:    vslidedown.vi v23, v0, 5
+; ZVFHMIN32-NEXT:    vslidedown.vi v19, v0, 4
+; ZVFHMIN32-NEXT:    vslidedown.vi v21, v0, 3
+; ZVFHMIN32-NEXT:    vslidedown.vi v3, v0, 2
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v0, 1
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    slli a0, a0, 1
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v0, 15
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 14
+; ZVFHMIN32-NEXT:    vslidedown.vi v12, v0, 13
+; ZVFHMIN32-NEXT:    vslidedown.vi v14, v0, 12
+; ZVFHMIN32-NEXT:    vslidedown.vi v26, v0, 11
+; ZVFHMIN32-NEXT:    vslidedown.vi v28, v0, 10
+; ZVFHMIN32-NEXT:    vslidedown.vi v30, v0, 9
+; ZVFHMIN32-NEXT:    vslidedown.vi v0, v0, 8
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a3, 24
+; ZVFHMIN32-NEXT:    mul a0, a0, a3
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vl2r.v v6, (a0) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v6
+; ZVFHMIN32-NEXT:    csrr a3, vlenb
+; ZVFHMIN32-NEXT:    li a4, 22
+; ZVFHMIN32-NEXT:    mul a3, a3, a4
+; ZVFHMIN32-NEXT:    add a3, sp, a3
+; ZVFHMIN32-NEXT:    addi a3, a3, 880
+; ZVFHMIN32-NEXT:    vl2r.v v6, (a3) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v6
+; ZVFHMIN32-NEXT:    csrr a4, vlenb
+; ZVFHMIN32-NEXT:    li a7, 20
+; ZVFHMIN32-NEXT:    mul a4, a4, a7
+; ZVFHMIN32-NEXT:    add a4, sp, a4
+; ZVFHMIN32-NEXT:    addi a4, a4, 880
+; ZVFHMIN32-NEXT:    vl2r.v v6, (a4) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a7, v6
+; ZVFHMIN32-NEXT:    csrr a4, vlenb
+; ZVFHMIN32-NEXT:    li t0, 18
+; ZVFHMIN32-NEXT:    mul a4, a4, t0
+; ZVFHMIN32-NEXT:    add a4, sp, a4
+; ZVFHMIN32-NEXT:    addi a4, a4, 880
+; ZVFHMIN32-NEXT:    vl2r.v v6, (a4) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s3, v6
+; ZVFHMIN32-NEXT:    csrr a4, vlenb
+; ZVFHMIN32-NEXT:    slli t0, a4, 3
+; ZVFHMIN32-NEXT:    add a4, t0, a4
+; ZVFHMIN32-NEXT:    add a4, sp, a4
+; ZVFHMIN32-NEXT:    addi a4, a4, 880
+; ZVFHMIN32-NEXT:    vl2r.v v6, (a4) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s10, v6
+; ZVFHMIN32-NEXT:    csrr a4, vlenb
+; ZVFHMIN32-NEXT:    slli t0, a4, 3
+; ZVFHMIN32-NEXT:    sub a4, t0, a4
+; ZVFHMIN32-NEXT:    add a4, sp, a4
+; ZVFHMIN32-NEXT:    addi a4, a4, 880
+; ZVFHMIN32-NEXT:    vl2r.v v6, (a4) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s11, v6
+; ZVFHMIN32-NEXT:    csrr a4, vlenb
+; ZVFHMIN32-NEXT:    slli t0, a4, 2
+; ZVFHMIN32-NEXT:    add a4, t0, a4
+; ZVFHMIN32-NEXT:    add a4, sp, a4
+; ZVFHMIN32-NEXT:    addi a4, a4, 880
+; ZVFHMIN32-NEXT:    vl2r.v v6, (a4) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s5, v6
+; ZVFHMIN32-NEXT:    csrr a4, vlenb
+; ZVFHMIN32-NEXT:    slli t0, a4, 1
+; ZVFHMIN32-NEXT:    add a4, t0, a4
+; ZVFHMIN32-NEXT:    add a4, sp, a4
+; ZVFHMIN32-NEXT:    addi a4, a4, 880
+; ZVFHMIN32-NEXT:    vl2r.v v6, (a4) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s7, v6
+; ZVFHMIN32-NEXT:    addi a4, sp, 880
+; ZVFHMIN32-NEXT:    vl2r.v v6, (a4) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s9, v6
+; ZVFHMIN32-NEXT:    vmv.x.s s8, v4
+; ZVFHMIN32-NEXT:    vmv.x.s s6, v2
+; ZVFHMIN32-NEXT:    vmv.x.s s4, v24
+; ZVFHMIN32-NEXT:    vmv.x.s s2, v22
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v20
+; ZVFHMIN32-NEXT:    vmv.x.s t0, v18
+; ZVFHMIN32-NEXT:    sw t0, 120(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s t0, v16
+; ZVFHMIN32-NEXT:    sw t0, 124(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s t6, v8
+; ZVFHMIN32-NEXT:    vmv.x.s t0, v10
+; ZVFHMIN32-NEXT:    vmv.x.s t1, v12
+; ZVFHMIN32-NEXT:    vmv.x.s t2, v14
+; ZVFHMIN32-NEXT:    vmv.x.s t3, v26
+; ZVFHMIN32-NEXT:    vmv.x.s t4, v28
+; ZVFHMIN32-NEXT:    vmv.x.s t5, v30
+; ZVFHMIN32-NEXT:    fmv.h.x fs8, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fs7, a5
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 5
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh a2, 880(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fs6, a2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a5, a2, 5
+; ZVFHMIN32-NEXT:    sub a2, a5, a2
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh a2, 880(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fs5, a2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a5, 30
+; ZVFHMIN32-NEXT:    mul a2, a2, a5
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh a2, 880(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x ft10, a2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a5, 29
+; ZVFHMIN32-NEXT:    mul a2, a2, a5
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh a2, 880(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x ft8, a2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a5, 28
+; ZVFHMIN32-NEXT:    mul a2, a2, a5
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh a2, 880(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x ft2, a2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a5, 27
+; ZVFHMIN32-NEXT:    mul a2, a2, a5
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh a2, 880(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x ft3, a2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a5, 26
+; ZVFHMIN32-NEXT:    mul a2, a2, a5
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh a2, 880(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x ft4, a2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a5, 11
+; ZVFHMIN32-NEXT:    mul a2, a2, a5
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh a2, 880(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x ft5, a2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a5, 12
+; ZVFHMIN32-NEXT:    mul a2, a2, a5
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh a2, 880(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x ft6, a2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a5, 13
+; ZVFHMIN32-NEXT:    mul a2, a2, a5
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh a2, 880(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa6, a2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a5, 14
+; ZVFHMIN32-NEXT:    mul a2, a2, a5
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh a2, 880(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fs0, a2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a5, a2, 4
+; ZVFHMIN32-NEXT:    sub a2, a5, a2
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh a2, 880(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fs1, a2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 4
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh a2, 880(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fs2, a2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a5, a2, 4
+; ZVFHMIN32-NEXT:    add a2, a5, a2
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh a2, 880(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fs3, a2
+; ZVFHMIN32-NEXT:    addi a2, sp, 256
+; ZVFHMIN32-NEXT:    fmv.h.x fs4, a0
+; ZVFHMIN32-NEXT:    fmv.h.x ft7, a3
+; ZVFHMIN32-NEXT:    fmv.h.x ft11, a7
+; ZVFHMIN32-NEXT:    fmv.h.x ft9, s3
+; ZVFHMIN32-NEXT:    fmv.h.x fa7, s10
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s11
+; ZVFHMIN32-NEXT:    fsh fa5, 114(sp) # 2-byte Folded Spill
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    slli a3, a0, 5
+; ZVFHMIN32-NEXT:    add a0, a3, a0
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    addi a0, a0, 880
+; ZVFHMIN32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN32-NEXT:    vse16.v v24, (a2)
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v0
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 15
+; ZVFHMIN32-NEXT:    vmv.x.s a5, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 14
+; ZVFHMIN32-NEXT:    vmv.x.s ra, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 13
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 12
+; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 11
+; ZVFHMIN32-NEXT:    vmv.x.s s3, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 10
+; ZVFHMIN32-NEXT:    vmv.x.s a7, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 9
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v8
+; ZVFHMIN32-NEXT:    sw a0, 116(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, s5
+; ZVFHMIN32-NEXT:    vmv.x.s s5, v5
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, s7
+; ZVFHMIN32-NEXT:    vmv.x.s s7, v17
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, s9
+; ZVFHMIN32-NEXT:    vmv.x.s s9, v23
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, s8
+; ZVFHMIN32-NEXT:    vmv.x.s s8, v19
+; ZVFHMIN32-NEXT:    fmv.h.x ft0, s6
+; ZVFHMIN32-NEXT:    vmv.x.s s6, v21
+; ZVFHMIN32-NEXT:    fmv.h.x ft1, s4
+; ZVFHMIN32-NEXT:    vmv.x.s s10, v3
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s2
+; ZVFHMIN32-NEXT:    fsh fa5, 112(sp) # 2-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s s2, v24
+; ZVFHMIN32-NEXT:    fmv.h.x fs9, a6
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    slli a0, a0, 1
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    lh a6, 880(a0) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 7
+; ZVFHMIN32-NEXT:    fmv.h.x fs10, s2
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 6
+; ZVFHMIN32-NEXT:    fmv.h.x fs11, s5
+; ZVFHMIN32-NEXT:    feq.h s2, fs8, fs9
+; ZVFHMIN32-NEXT:    fmv.h.x fs8, s7
+; ZVFHMIN32-NEXT:    vmv.x.s s7, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 5
+; ZVFHMIN32-NEXT:    fmv.h.x fs9, s9
+; ZVFHMIN32-NEXT:    feq.h s11, fs7, fs10
+; ZVFHMIN32-NEXT:    fmv.h.x fs7, s8
+; ZVFHMIN32-NEXT:    vmv.x.s s8, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 4
+; ZVFHMIN32-NEXT:    fmv.h.x fs10, s6
+; ZVFHMIN32-NEXT:    feq.h s4, fs6, fs11
+; ZVFHMIN32-NEXT:    fmv.h.x fs6, s10
+; ZVFHMIN32-NEXT:    vmv.x.s s9, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 3
+; ZVFHMIN32-NEXT:    fmv.h.x fs11, a6
+; ZVFHMIN32-NEXT:    feq.h s5, fs5, fs8
+; ZVFHMIN32-NEXT:    fmv.h.x fs5, a0
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 2
+; ZVFHMIN32-NEXT:    fmv.h.x fs8, s7
+; ZVFHMIN32-NEXT:    feq.h s6, ft10, fs9
+; ZVFHMIN32-NEXT:    fmv.h.x fs9, s8
+; ZVFHMIN32-NEXT:    vmv.x.s a6, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 1
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN32-NEXT:    feq.h s7, ft8, fs7
+; ZVFHMIN32-NEXT:    fmv.h.x fs7, a0
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a6
+; ZVFHMIN32-NEXT:    feq.h s8, ft2, fs10
+; ZVFHMIN32-NEXT:    fmv.h.x fs10, a0
+; ZVFHMIN32-NEXT:    feq.h s9, ft3, fs6
+; ZVFHMIN32-NEXT:    fmv.h.x fs6, t6
+; ZVFHMIN32-NEXT:    feq.h s10, ft4, fs11
+; ZVFHMIN32-NEXT:    fmv.h.x fs11, t0
+; ZVFHMIN32-NEXT:    feq.h t0, ft5, fs5
+; ZVFHMIN32-NEXT:    fmv.h.x fs5, t1
+; ZVFHMIN32-NEXT:    feq.h t1, ft6, fs8
+; ZVFHMIN32-NEXT:    fmv.h.x ft10, t2
+; ZVFHMIN32-NEXT:    feq.h t2, fa6, fs9
+; ZVFHMIN32-NEXT:    fmv.h.x ft8, t3
+; ZVFHMIN32-NEXT:    feq.h t3, fs0, fa5
+; ZVFHMIN32-NEXT:    fmv.h.x ft2, t4
+; ZVFHMIN32-NEXT:    feq.h t4, fs1, fs7
+; ZVFHMIN32-NEXT:    fmv.h.x ft3, t5
+; ZVFHMIN32-NEXT:    feq.h t5, fs2, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x ft4, a3
+; ZVFHMIN32-NEXT:    feq.h t6, fs3, fs10
+; ZVFHMIN32-NEXT:    fmv.h.x ft5, a5
+; ZVFHMIN32-NEXT:    feq.h a0, fs4, fs6
+; ZVFHMIN32-NEXT:    fmv.h.x ft6, ra
+; ZVFHMIN32-NEXT:    feq.h a5, ft7, fs11
+; ZVFHMIN32-NEXT:    fmv.h.x ft7, a2
+; ZVFHMIN32-NEXT:    lh a2, 704(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa6, a1
+; ZVFHMIN32-NEXT:    feq.h a6, ft11, fs5
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
 ; ZVFHMIN32-NEXT:    lh a1, 448(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 160(sp)
-; ZVFHMIN32-NEXT:    lh a0, 702(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 160(sp)
+; ZVFHMIN32-NEXT:    lh a1, 702(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 446(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 159(sp)
-; ZVFHMIN32-NEXT:    lh a0, 700(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 159(sp)
+; ZVFHMIN32-NEXT:    lh a1, 700(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 444(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 158(sp)
-; ZVFHMIN32-NEXT:    lh a0, 698(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 158(sp)
+; ZVFHMIN32-NEXT:    lh a1, 698(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 442(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 157(sp)
-; ZVFHMIN32-NEXT:    lh a0, 696(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 157(sp)
+; ZVFHMIN32-NEXT:    lh a1, 696(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 440(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 156(sp)
-; ZVFHMIN32-NEXT:    lh a0, 694(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 156(sp)
+; ZVFHMIN32-NEXT:    lh a1, 694(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 438(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 155(sp)
-; ZVFHMIN32-NEXT:    lh a0, 692(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 155(sp)
+; ZVFHMIN32-NEXT:    lh a1, 692(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 436(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 154(sp)
-; ZVFHMIN32-NEXT:    lh a0, 690(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 154(sp)
+; ZVFHMIN32-NEXT:    lh a1, 690(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 434(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 153(sp)
-; ZVFHMIN32-NEXT:    lh a0, 688(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 153(sp)
+; ZVFHMIN32-NEXT:    lh a1, 688(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 432(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 152(sp)
-; ZVFHMIN32-NEXT:    lh a0, 686(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 152(sp)
+; ZVFHMIN32-NEXT:    lh a1, 686(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 430(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 151(sp)
-; ZVFHMIN32-NEXT:    lh a0, 684(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 151(sp)
+; ZVFHMIN32-NEXT:    lh a1, 684(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 428(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 150(sp)
-; ZVFHMIN32-NEXT:    lh a0, 682(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 150(sp)
+; ZVFHMIN32-NEXT:    lh a1, 682(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 426(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 149(sp)
-; ZVFHMIN32-NEXT:    lh a0, 680(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 149(sp)
+; ZVFHMIN32-NEXT:    lh a1, 680(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 424(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 148(sp)
-; ZVFHMIN32-NEXT:    lh a0, 678(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 148(sp)
+; ZVFHMIN32-NEXT:    lh a1, 678(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 422(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 147(sp)
-; ZVFHMIN32-NEXT:    lh a0, 676(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 147(sp)
+; ZVFHMIN32-NEXT:    lh a1, 676(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 420(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 146(sp)
-; ZVFHMIN32-NEXT:    lh a0, 674(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 146(sp)
+; ZVFHMIN32-NEXT:    lh a1, 674(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 418(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
-; ZVFHMIN32-NEXT:    sb a0, 145(sp)
-; ZVFHMIN32-NEXT:    lh a0, 672(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 145(sp)
+; ZVFHMIN32-NEXT:    lh a1, 672(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 416(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a2, 128(sp)
-; ZVFHMIN32-NEXT:    sb a0, 144(sp)
-; ZVFHMIN32-NEXT:    lh a0, 576(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb s2, 128(sp)
+; ZVFHMIN32-NEXT:    feq.h s2, ft9, ft10
+; ZVFHMIN32-NEXT:    sb a1, 144(sp)
+; ZVFHMIN32-NEXT:    lh a1, 576(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 320(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 224(sp)
-; ZVFHMIN32-NEXT:    lh a0, 574(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 224(sp)
+; ZVFHMIN32-NEXT:    lh a1, 574(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 318(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 223(sp)
-; ZVFHMIN32-NEXT:    lh a0, 572(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 223(sp)
+; ZVFHMIN32-NEXT:    lh a1, 572(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 316(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 222(sp)
-; ZVFHMIN32-NEXT:    lh a0, 570(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 222(sp)
+; ZVFHMIN32-NEXT:    lh a1, 570(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 314(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 221(sp)
-; ZVFHMIN32-NEXT:    lh a0, 568(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 221(sp)
+; ZVFHMIN32-NEXT:    lh a1, 568(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 312(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 220(sp)
-; ZVFHMIN32-NEXT:    lh a0, 566(sp)
-; ZVFHMIN32-NEXT:    lh a1, 310(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 220(sp)
+; ZVFHMIN32-NEXT:    lh a1, 566(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 310(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 219(sp)
-; ZVFHMIN32-NEXT:    lh a0, 564(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 219(sp)
+; ZVFHMIN32-NEXT:    lh a1, 564(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 308(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 218(sp)
-; ZVFHMIN32-NEXT:    lh a0, 562(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 218(sp)
+; ZVFHMIN32-NEXT:    lh a1, 562(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 306(sp)
-; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 7
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 29
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 6
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 28
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 5
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 27
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 4
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 26
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 3
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 25
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 2
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 24
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 1
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 23
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v26, v8, 15
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 14
-; ZVFHMIN32-NEXT:    vslidedown.vi v28, v8, 13
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 12
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 1
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v4, v8, 11
-; ZVFHMIN32-NEXT:    vslidedown.vi v2, v8, 10
-; ZVFHMIN32-NEXT:    vslidedown.vi v30, v8, 9
-; ZVFHMIN32-NEXT:    vslidedown.vi v22, v8, 8
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v16
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 217(sp)
-; ZVFHMIN32-NEXT:    lh a0, 560(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 217(sp)
+; ZVFHMIN32-NEXT:    lh a1, 560(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 304(sp)
-; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v3, v16, 7
-; ZVFHMIN32-NEXT:    vslidedown.vi v31, v16, 6
-; ZVFHMIN32-NEXT:    vslidedown.vi v5, v16, 5
-; ZVFHMIN32-NEXT:    vslidedown.vi v23, v16, 4
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 3
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 21
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 2
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 20
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 1
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 22
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v18, v16, 15
-; ZVFHMIN32-NEXT:    vslidedown.vi v14, v16, 14
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 13
-; ZVFHMIN32-NEXT:    vslidedown.vi v12, v16, 12
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 11
-; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 10
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 18
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 9
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 14
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 216(sp)
-; ZVFHMIN32-NEXT:    lh a0, 558(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 216(sp)
+; ZVFHMIN32-NEXT:    lh a1, 558(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 302(sp)
-; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v13, v0, 7
-; ZVFHMIN32-NEXT:    vslidedown.vi v29, v0, 6
-; ZVFHMIN32-NEXT:    vslidedown.vi v11, v0, 5
-; ZVFHMIN32-NEXT:    vslidedown.vi v7, v0, 4
-; ZVFHMIN32-NEXT:    vslidedown.vi v9, v0, 3
-; ZVFHMIN32-NEXT:    vslidedown.vi v21, v0, 2
-; ZVFHMIN32-NEXT:    vslidedown.vi v27, v0, 1
-; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 15
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 2
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 14
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 13
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 6
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 12
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 12
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 11
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 10
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 10
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 4
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 9
-; ZVFHMIN32-NEXT:    vslidedown.vi v0, v0, 8
-; ZVFHMIN32-NEXT:    addi a2, sp, 848
-; ZVFHMIN32-NEXT:    vs2r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vmv.x.s t4, v26
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 215(sp)
-; ZVFHMIN32-NEXT:    lh a0, 556(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 215(sp)
+; ZVFHMIN32-NEXT:    lh a1, 556(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 300(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t3, v20
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v28
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 214(sp)
-; ZVFHMIN32-NEXT:    lh a0, 554(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 214(sp)
+; ZVFHMIN32-NEXT:    lh a1, 554(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 298(sp)
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 1
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vl2r.v v0, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s t2, v0
-; ZVFHMIN32-NEXT:    vmv.x.s t0, v4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 213(sp)
-; ZVFHMIN32-NEXT:    lh a0, 552(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 213(sp)
+; ZVFHMIN32-NEXT:    lh a1, 552(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 296(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a7, v2
-; ZVFHMIN32-NEXT:    vmv.x.s a6, v30
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 212(sp)
-; ZVFHMIN32-NEXT:    lh a0, 550(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 212(sp)
+; ZVFHMIN32-NEXT:    lh a1, 550(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 294(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v22
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v18
-; ZVFHMIN32-NEXT:    sw a2, 112(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 211(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 211(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 548(sp)
-; ZVFHMIN32-NEXT:    lh t5, 292(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v14
-; ZVFHMIN32-NEXT:    sw a0, 116(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v8
-; ZVFHMIN32-NEXT:    sw a0, 124(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t5
+; ZVFHMIN32-NEXT:    lh a1, 292(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a1, 210(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 546(sp)
-; ZVFHMIN32-NEXT:    lh t5, 290(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v24
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 290(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, t5
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa3
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a1, 209(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 544(sp)
-; ZVFHMIN32-NEXT:    lh t5, 288(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t5
+; ZVFHMIN32-NEXT:    lh a1, 288(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 192(sp)
+; ZVFHMIN32-NEXT:    sb s11, 192(sp)
+; ZVFHMIN32-NEXT:    feq.h s11, fa7, ft8
 ; ZVFHMIN32-NEXT:    sb a1, 208(sp)
-; ZVFHMIN32-NEXT:    lh t5, 738(sp)
-; ZVFHMIN32-NEXT:    lh t6, 482(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v12
-; ZVFHMIN32-NEXT:    sw a0, 108(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v10
-; ZVFHMIN32-NEXT:    sw a0, 120(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t5, 177(sp)
-; ZVFHMIN32-NEXT:    lh t5, 736(sp)
-; ZVFHMIN32-NEXT:    lh t6, 480(sp)
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 29
-; ZVFHMIN32-NEXT:    mul a0, a0, a1
-; ZVFHMIN32-NEXT:    add a0, sp, a0
-; ZVFHMIN32-NEXT:    lh s5, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 28
-; ZVFHMIN32-NEXT:    mul a0, a0, a1
-; ZVFHMIN32-NEXT:    add a0, sp, a0
-; ZVFHMIN32-NEXT:    lh s6, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t5, 176(sp)
-; ZVFHMIN32-NEXT:    lh t5, 734(sp)
-; ZVFHMIN32-NEXT:    lh t6, 478(sp)
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 27
-; ZVFHMIN32-NEXT:    mul a0, a0, a1
-; ZVFHMIN32-NEXT:    add a0, sp, a0
-; ZVFHMIN32-NEXT:    lh s7, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 26
-; ZVFHMIN32-NEXT:    mul a0, a0, a1
-; ZVFHMIN32-NEXT:    add a0, sp, a0
-; ZVFHMIN32-NEXT:    lh s8, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t5, 175(sp)
-; ZVFHMIN32-NEXT:    lh t5, 732(sp)
-; ZVFHMIN32-NEXT:    lh t6, 476(sp)
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 25
-; ZVFHMIN32-NEXT:    mul a0, a0, a1
-; ZVFHMIN32-NEXT:    add a0, sp, a0
-; ZVFHMIN32-NEXT:    lh s4, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 24
-; ZVFHMIN32-NEXT:    mul a0, a0, a1
-; ZVFHMIN32-NEXT:    add a0, sp, a0
-; ZVFHMIN32-NEXT:    lh s3, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t5, 174(sp)
-; ZVFHMIN32-NEXT:    lh t6, 730(sp)
-; ZVFHMIN32-NEXT:    lh s9, 474(sp)
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 23
-; ZVFHMIN32-NEXT:    mul a0, a0, a1
-; ZVFHMIN32-NEXT:    add a0, sp, a0
-; ZVFHMIN32-NEXT:    lh s2, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v3
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s9
-; ZVFHMIN32-NEXT:    feq.h t6, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t6, 173(sp)
-; ZVFHMIN32-NEXT:    lh s9, 728(sp)
-; ZVFHMIN32-NEXT:    lh s10, 472(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t6, v31
-; ZVFHMIN32-NEXT:    vmv.x.s ra, v13
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
-; ZVFHMIN32-NEXT:    feq.h s9, fa5, fa4
-; ZVFHMIN32-NEXT:    sb s9, 172(sp)
-; ZVFHMIN32-NEXT:    lh s9, 726(sp)
-; ZVFHMIN32-NEXT:    lh s10, 470(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v29
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v11
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
-; ZVFHMIN32-NEXT:    feq.h s9, fa5, fa4
-; ZVFHMIN32-NEXT:    sb s9, 171(sp)
-; ZVFHMIN32-NEXT:    lh s10, 724(sp)
-; ZVFHMIN32-NEXT:    lh s11, 468(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v7
-; ZVFHMIN32-NEXT:    vmv.x.s s9, v9
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s10
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s11
-; ZVFHMIN32-NEXT:    feq.h s10, fa5, fa4
-; ZVFHMIN32-NEXT:    sb s10, 170(sp)
-; ZVFHMIN32-NEXT:    lh a0, 722(sp)
+; ZVFHMIN32-NEXT:    lh a1, 738(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 482(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 177(sp)
+; ZVFHMIN32-NEXT:    lh a1, 736(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 480(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 176(sp)
+; ZVFHMIN32-NEXT:    lh a1, 734(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 478(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 175(sp)
+; ZVFHMIN32-NEXT:    lh a1, 732(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 476(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 174(sp)
+; ZVFHMIN32-NEXT:    lh a1, 730(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 474(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 173(sp)
+; ZVFHMIN32-NEXT:    lh a1, 728(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 472(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 172(sp)
+; ZVFHMIN32-NEXT:    lh a1, 726(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 470(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 171(sp)
+; ZVFHMIN32-NEXT:    lh a1, 724(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 468(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 170(sp)
+; ZVFHMIN32-NEXT:    lh a1, 722(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 466(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s10, v21
-; ZVFHMIN32-NEXT:    vmv.x.s s11, v27
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 169(sp)
-; ZVFHMIN32-NEXT:    lh a0, 720(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 169(sp)
+; ZVFHMIN32-NEXT:    lh a1, 720(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 464(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s5
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s6
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN32-NEXT:    sb a0, 168(sp)
-; ZVFHMIN32-NEXT:    lh a0, 718(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 168(sp)
+; ZVFHMIN32-NEXT:    lh a1, 718(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 462(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, s7
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, s8
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa0, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa1, fa0
-; ZVFHMIN32-NEXT:    fmv.h.x fa1, ra
-; ZVFHMIN32-NEXT:    sb a0, 167(sp)
-; ZVFHMIN32-NEXT:    lh a0, 716(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa0, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 167(sp)
+; ZVFHMIN32-NEXT:    lh a1, 716(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 460(sp)
-; ZVFHMIN32-NEXT:    feq.h s5, fa5, fa1
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s4
 ; ZVFHMIN32-NEXT:    sb a1, 166(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 714(sp)
-; ZVFHMIN32-NEXT:    lh a2, 458(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a3, fa3, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 458(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa3
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s3
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a1, 165(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 712(sp)
-; ZVFHMIN32-NEXT:    lh a2, 456(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a4
-; ZVFHMIN32-NEXT:    feq.h a4, fa2, fa3
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa3, fa2
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, s2
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 456(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a1, 164(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 710(sp)
-; ZVFHMIN32-NEXT:    lh a2, 454(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, s9
-; ZVFHMIN32-NEXT:    feq.h s2, fa5, fa2
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa2
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s10
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, s11
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 454(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a1, 163(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 708(sp)
-; ZVFHMIN32-NEXT:    lh a2, 452(sp)
-; ZVFHMIN32-NEXT:    feq.h s3, fa4, fa5
-; ZVFHMIN32-NEXT:    feq.h s4, fa3, fa2
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    lh a1, 452(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s3
 ; ZVFHMIN32-NEXT:    sb a1, 162(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 706(sp)
 ; ZVFHMIN32-NEXT:    lh a2, 450(sp)
-; ZVFHMIN32-NEXT:    sb s4, 129(sp)
-; ZVFHMIN32-NEXT:    sb s3, 130(sp)
-; ZVFHMIN32-NEXT:    sb s2, 131(sp)
-; ZVFHMIN32-NEXT:    sb a4, 132(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a3, 133(sp)
-; ZVFHMIN32-NEXT:    sb a0, 134(sp)
-; ZVFHMIN32-NEXT:    sb s5, 135(sp)
+; ZVFHMIN32-NEXT:    sb s10, 129(sp)
+; ZVFHMIN32-NEXT:    flh fa4, 114(sp) # 2-byte Folded Reload
+; ZVFHMIN32-NEXT:    feq.h s10, fa4, ft2
+; ZVFHMIN32-NEXT:    sb s9, 130(sp)
+; ZVFHMIN32-NEXT:    feq.h s9, fa3, ft3
+; ZVFHMIN32-NEXT:    sb s8, 131(sp)
+; ZVFHMIN32-NEXT:    feq.h ra, fa2, ft4
+; ZVFHMIN32-NEXT:    sb s7, 132(sp)
+; ZVFHMIN32-NEXT:    feq.h s3, fa1, ft5
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h s7, fa0, ft6
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a2
+; ZVFHMIN32-NEXT:    feq.h s8, ft0, ft7
+; ZVFHMIN32-NEXT:    sb s6, 133(sp)
+; ZVFHMIN32-NEXT:    feq.h s6, ft1, fa6
+; ZVFHMIN32-NEXT:    sb s5, 134(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa3
+; ZVFHMIN32-NEXT:    sb s4, 135(sp)
+; ZVFHMIN32-NEXT:    flh fa4, 112(sp) # 2-byte Folded Reload
+; ZVFHMIN32-NEXT:    feq.h s4, fa4, fa5
 ; ZVFHMIN32-NEXT:    sb a1, 161(sp)
-; ZVFHMIN32-NEXT:    lh a0, 610(sp)
+; ZVFHMIN32-NEXT:    lh a1, 610(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 354(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s6, v5
-; ZVFHMIN32-NEXT:    vmv.x.s s5, v23
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 241(sp)
-; ZVFHMIN32-NEXT:    lh a0, 608(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 241(sp)
+; ZVFHMIN32-NEXT:    lh a1, 608(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 352(sp)
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 21
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s4, 848(a2) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 20
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s3, 848(a2) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 240(sp)
-; ZVFHMIN32-NEXT:    lh a0, 606(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 240(sp)
+; ZVFHMIN32-NEXT:    lh a1, 606(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 350(sp)
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 22
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s2, 848(a2) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
-; ZVFHMIN32-NEXT:    sb a0, 239(sp)
-; ZVFHMIN32-NEXT:    lh a0, 604(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 239(sp)
+; ZVFHMIN32-NEXT:    lh a1, 604(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 348(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 7
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN32-NEXT:    sb a0, 238(sp)
-; ZVFHMIN32-NEXT:    lh a0, 602(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 238(sp)
+; ZVFHMIN32-NEXT:    lh a1, 602(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 346(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 6
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN32-NEXT:    sb a0, 237(sp)
-; ZVFHMIN32-NEXT:    lh a0, 600(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 237(sp)
+; ZVFHMIN32-NEXT:    lh a1, 600(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 344(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 5
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN32-NEXT:    sb a0, 236(sp)
-; ZVFHMIN32-NEXT:    lh a0, 598(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 236(sp)
+; ZVFHMIN32-NEXT:    lh a1, 598(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 342(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 4
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN32-NEXT:    sb a0, 235(sp)
-; ZVFHMIN32-NEXT:    lh a0, 596(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 235(sp)
+; ZVFHMIN32-NEXT:    lh a1, 596(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 340(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s8, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 3
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN32-NEXT:    sb a0, 234(sp)
-; ZVFHMIN32-NEXT:    lh a0, 594(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 234(sp)
+; ZVFHMIN32-NEXT:    lh a1, 594(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 338(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s9, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 2
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN32-NEXT:    sb a0, 233(sp)
-; ZVFHMIN32-NEXT:    lh a0, 592(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN32-NEXT:    lh t5, 336(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 1
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN32-NEXT:    vmv.x.s s7, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, t5
-; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN32-NEXT:    fmv.h.x fa3, a2
-; ZVFHMIN32-NEXT:    sb a0, 232(sp)
-; ZVFHMIN32-NEXT:    lh a0, 590(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa2, a3
-; ZVFHMIN32-NEXT:    lh a2, 334(sp)
-; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa3
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    feq.h t6, fa4, fa2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s6
-; ZVFHMIN32-NEXT:    sb a0, 231(sp)
-; ZVFHMIN32-NEXT:    lh a0, 588(sp)
-; ZVFHMIN32-NEXT:    lh a2, 332(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s5
-; ZVFHMIN32-NEXT:    sb a0, 230(sp)
-; ZVFHMIN32-NEXT:    lh a0, 586(sp)
-; ZVFHMIN32-NEXT:    lh a2, 330(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s8
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s4
-; ZVFHMIN32-NEXT:    sb a0, 229(sp)
-; ZVFHMIN32-NEXT:    lh a0, 584(sp)
-; ZVFHMIN32-NEXT:    lh a2, 328(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s9
-; ZVFHMIN32-NEXT:    feq.h s4, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s3
-; ZVFHMIN32-NEXT:    sb a0, 228(sp)
-; ZVFHMIN32-NEXT:    lh a0, 582(sp)
-; ZVFHMIN32-NEXT:    lh a2, 326(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s2
-; ZVFHMIN32-NEXT:    sb a0, 227(sp)
-; ZVFHMIN32-NEXT:    lh a0, 580(sp)
-; ZVFHMIN32-NEXT:    lh a2, 324(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s7
-; ZVFHMIN32-NEXT:    feq.h s2, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 226(sp)
-; ZVFHMIN32-NEXT:    lh a0, 578(sp)
+; ZVFHMIN32-NEXT:    sb a1, 233(sp)
+; ZVFHMIN32-NEXT:    lh a1, 592(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 336(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 232(sp)
+; ZVFHMIN32-NEXT:    lh a1, 590(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 334(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 231(sp)
+; ZVFHMIN32-NEXT:    lh a1, 588(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 332(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 230(sp)
+; ZVFHMIN32-NEXT:    lh a1, 586(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 330(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 229(sp)
+; ZVFHMIN32-NEXT:    lh a1, 584(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 328(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 228(sp)
+; ZVFHMIN32-NEXT:    lh a1, 582(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 326(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 227(sp)
+; ZVFHMIN32-NEXT:    lh a1, 580(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 324(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT:    sb a1, 226(sp)
+; ZVFHMIN32-NEXT:    lh a1, 578(sp)
 ; ZVFHMIN32-NEXT:    lh a2, 322(sp)
-; ZVFHMIN32-NEXT:    sb s2, 193(sp)
-; ZVFHMIN32-NEXT:    sb a1, 194(sp)
-; ZVFHMIN32-NEXT:    sb s4, 195(sp)
-; ZVFHMIN32-NEXT:    sb a4, 196(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    sb t6, 193(sp)
+; ZVFHMIN32-NEXT:    sb t5, 194(sp)
+; ZVFHMIN32-NEXT:    sb t4, 195(sp)
+; ZVFHMIN32-NEXT:    sb t3, 196(sp)
+; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a3, 197(sp)
-; ZVFHMIN32-NEXT:    sb t6, 198(sp)
-; ZVFHMIN32-NEXT:    sb t5, 199(sp)
-; ZVFHMIN32-NEXT:    sb a0, 225(sp)
-; ZVFHMIN32-NEXT:    lh a0, 766(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb t2, 197(sp)
+; ZVFHMIN32-NEXT:    sb t1, 198(sp)
+; ZVFHMIN32-NEXT:    sb t0, 199(sp)
+; ZVFHMIN32-NEXT:    sb a1, 225(sp)
+; ZVFHMIN32-NEXT:    lh a1, 766(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 510(sp)
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 18
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s s2, v8
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 14
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s t6, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 191(sp)
-; ZVFHMIN32-NEXT:    lh a0, 764(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 191(sp)
+; ZVFHMIN32-NEXT:    lh a1, 764(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 508(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v6
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 2
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 190(sp)
-; ZVFHMIN32-NEXT:    lh a0, 762(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 190(sp)
+; ZVFHMIN32-NEXT:    lh a1, 762(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 506(sp)
-; ZVFHMIN32-NEXT:    csrr a3, vlenb
-; ZVFHMIN32-NEXT:    slli a3, a3, 3
-; ZVFHMIN32-NEXT:    add a3, sp, a3
-; ZVFHMIN32-NEXT:    addi a3, a3, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN32-NEXT:    csrr a4, vlenb
-; ZVFHMIN32-NEXT:    li s3, 6
-; ZVFHMIN32-NEXT:    mul a4, a4, s3
-; ZVFHMIN32-NEXT:    add a4, sp, a4
-; ZVFHMIN32-NEXT:    addi a4, a4, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 189(sp)
-; ZVFHMIN32-NEXT:    lh a0, 760(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 189(sp)
+; ZVFHMIN32-NEXT:    lh a1, 760(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 504(sp)
-; ZVFHMIN32-NEXT:    csrr s3, vlenb
-; ZVFHMIN32-NEXT:    li s4, 12
-; ZVFHMIN32-NEXT:    mul s3, s3, s4
-; ZVFHMIN32-NEXT:    add s3, sp, s3
-; ZVFHMIN32-NEXT:    addi s3, s3, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s s6, v8
-; ZVFHMIN32-NEXT:    csrr s3, vlenb
-; ZVFHMIN32-NEXT:    li s4, 10
-; ZVFHMIN32-NEXT:    mul s3, s3, s4
-; ZVFHMIN32-NEXT:    add s3, sp, s3
-; ZVFHMIN32-NEXT:    addi s3, s3, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s s4, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 188(sp)
-; ZVFHMIN32-NEXT:    lh a0, 758(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 188(sp)
+; ZVFHMIN32-NEXT:    lh a1, 758(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 502(sp)
-; ZVFHMIN32-NEXT:    csrr s3, vlenb
-; ZVFHMIN32-NEXT:    slli s3, s3, 4
-; ZVFHMIN32-NEXT:    add s3, sp, s3
-; ZVFHMIN32-NEXT:    addi s3, s3, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s s5, v8
-; ZVFHMIN32-NEXT:    vmv.x.s s3, v16
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN32-NEXT:    sb a0, 187(sp)
-; ZVFHMIN32-NEXT:    lh a0, 756(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 187(sp)
+; ZVFHMIN32-NEXT:    lh a1, 756(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 500(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h t4, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t3
-; ZVFHMIN32-NEXT:    sb a0, 186(sp)
-; ZVFHMIN32-NEXT:    lh a0, 754(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 186(sp)
+; ZVFHMIN32-NEXT:    lh a1, 754(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 498(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h t3, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t1
-; ZVFHMIN32-NEXT:    sb a0, 185(sp)
-; ZVFHMIN32-NEXT:    lh a0, 752(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 185(sp)
+; ZVFHMIN32-NEXT:    lh a1, 752(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 496(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h t1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    sb a0, 184(sp)
-; ZVFHMIN32-NEXT:    lh a0, 750(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 184(sp)
+; ZVFHMIN32-NEXT:    lh a1, 750(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 494(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s6
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT:    sb a0, 183(sp)
-; ZVFHMIN32-NEXT:    lh a0, 748(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 183(sp)
+; ZVFHMIN32-NEXT:    lh a1, 748(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 492(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s4
-; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a7
-; ZVFHMIN32-NEXT:    sb a0, 182(sp)
-; ZVFHMIN32-NEXT:    lh a0, 746(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 182(sp)
+; ZVFHMIN32-NEXT:    lh a1, 746(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 490(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT:    sb a0, 181(sp)
-; ZVFHMIN32-NEXT:    lh a0, 744(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 181(sp)
+; ZVFHMIN32-NEXT:    lh a1, 744(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    lh a1, 488(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s3
-; ZVFHMIN32-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a5
-; ZVFHMIN32-NEXT:    addi a1, sp, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 15
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v8
-; ZVFHMIN32-NEXT:    sb a0, 180(sp)
-; ZVFHMIN32-NEXT:    lh a0, 742(sp)
-; ZVFHMIN32-NEXT:    lh a7, 486(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 180(sp)
+; ZVFHMIN32-NEXT:    lh a1, 742(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    lh a1, 486(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 179(sp)
-; ZVFHMIN32-NEXT:    lh a0, 740(sp)
-; ZVFHMIN32-NEXT:    lh a7, 484(sp)
-; ZVFHMIN32-NEXT:    sb a2, 140(sp)
-; ZVFHMIN32-NEXT:    sb t1, 141(sp)
-; ZVFHMIN32-NEXT:    sb t3, 142(sp)
-; ZVFHMIN32-NEXT:    sb t4, 143(sp)
-; ZVFHMIN32-NEXT:    sb a1, 136(sp)
-; ZVFHMIN32-NEXT:    sb a6, 137(sp)
-; ZVFHMIN32-NEXT:    sb a4, 138(sp)
-; ZVFHMIN32-NEXT:    sb a3, 139(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT:    lw a2, 120(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    lw a2, 116(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    sb a1, 179(sp)
+; ZVFHMIN32-NEXT:    lh a2, 740(sp)
+; ZVFHMIN32-NEXT:    lh a3, 484(sp)
+; ZVFHMIN32-NEXT:    sb s2, 140(sp)
+; ZVFHMIN32-NEXT:    sb a6, 141(sp)
+; ZVFHMIN32-NEXT:    sb a5, 142(sp)
+; ZVFHMIN32-NEXT:    sb a0, 143(sp)
+; ZVFHMIN32-NEXT:    sb ra, 136(sp)
+; ZVFHMIN32-NEXT:    sb s9, 137(sp)
+; ZVFHMIN32-NEXT:    sb s10, 138(sp)
+; ZVFHMIN32-NEXT:    sb s11, 139(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 178(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 638(sp)
-; ZVFHMIN32-NEXT:    lh a1, 382(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 14
-; ZVFHMIN32-NEXT:    vmv.x.s t3, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    lh a0, 382(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 255(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 636(sp)
-; ZVFHMIN32-NEXT:    lh a1, 380(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 13
-; ZVFHMIN32-NEXT:    vmv.x.s t2, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    lh a0, 380(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 254(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 634(sp)
-; ZVFHMIN32-NEXT:    lh a1, 378(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 12
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    lh a0, 378(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 253(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 632(sp)
-; ZVFHMIN32-NEXT:    lh a1, 376(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 11
-; ZVFHMIN32-NEXT:    vmv.x.s t0, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    lh a0, 376(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 252(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 630(sp)
-; ZVFHMIN32-NEXT:    lh a1, 374(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 10
-; ZVFHMIN32-NEXT:    vmv.x.s a7, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    lh a0, 374(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 251(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 628(sp)
-; ZVFHMIN32-NEXT:    lh a1, 372(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 9
-; ZVFHMIN32-NEXT:    vmv.x.s a6, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    lh a0, 372(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a1, 112(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    sb a0, 250(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 626(sp)
-; ZVFHMIN32-NEXT:    lh a1, 370(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    lh a0, 370(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a1, 116(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    sb a0, 249(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 624(sp)
-; ZVFHMIN32-NEXT:    lh a1, 368(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    lh a0, 368(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a1, 124(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    sb a0, 248(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 622(sp)
-; ZVFHMIN32-NEXT:    lh a1, 366(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    lh a0, 366(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a1, 108(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    sb a0, 247(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 620(sp)
-; ZVFHMIN32-NEXT:    lh a1, 364(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    lh a0, 364(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    lw a1, 120(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN32-NEXT:    sb a0, 246(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 618(sp)
-; ZVFHMIN32-NEXT:    lh a1, 362(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t0
-; ZVFHMIN32-NEXT:    feq.h t0, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    lh a0, 362(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s2
 ; ZVFHMIN32-NEXT:    sb a0, 245(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 616(sp)
-; ZVFHMIN32-NEXT:    lh a1, 360(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT:    feq.h a7, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    lh a0, 360(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
 ; ZVFHMIN32-NEXT:    sb a0, 244(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 614(sp)
-; ZVFHMIN32-NEXT:    lh a1, 358(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a6
-; ZVFHMIN32-NEXT:    feq.h a6, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    lh a0, 358(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    lw a2, 124(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 8
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN32-NEXT:    sb a0, 243(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 612(sp)
-; ZVFHMIN32-NEXT:    lh a1, 356(sp)
-; ZVFHMIN32-NEXT:    sb a5, 204(sp)
-; ZVFHMIN32-NEXT:    sb a4, 205(sp)
-; ZVFHMIN32-NEXT:    sb a2, 206(sp)
-; ZVFHMIN32-NEXT:    sb a3, 207(sp)
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a2, 200(sp)
-; ZVFHMIN32-NEXT:    sb a6, 201(sp)
-; ZVFHMIN32-NEXT:    sb a7, 202(sp)
-; ZVFHMIN32-NEXT:    sb t0, 203(sp)
-; ZVFHMIN32-NEXT:    li a2, 128
+; ZVFHMIN32-NEXT:    lh a2, 356(sp)
+; ZVFHMIN32-NEXT:    sb s6, 204(sp)
+; ZVFHMIN32-NEXT:    sb s8, 205(sp)
+; ZVFHMIN32-NEXT:    sb s7, 206(sp)
+; ZVFHMIN32-NEXT:    sb s3, 207(sp)
+; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a3, 200(sp)
+; ZVFHMIN32-NEXT:    sb a1, 201(sp)
+; ZVFHMIN32-NEXT:    sb a4, 202(sp)
+; ZVFHMIN32-NEXT:    sb s4, 203(sp)
+; ZVFHMIN32-NEXT:    li a1, 128
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 242(sp)
 ; ZVFHMIN32-NEXT:    addi a0, sp, 128
-; ZVFHMIN32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; ZVFHMIN32-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; ZVFHMIN32-NEXT:    vle8.v v8, (a0)
 ; ZVFHMIN32-NEXT:    vand.vi v8, v8, 1
 ; ZVFHMIN32-NEXT:    vmsne.vi v0, v8, 0
-; ZVFHMIN32-NEXT:    addi sp, s0, -896
-; ZVFHMIN32-NEXT:    .cfi_def_cfa sp, 896
-; ZVFHMIN32-NEXT:    lw ra, 892(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s0, 888(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s2, 884(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s3, 880(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s4, 876(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s5, 872(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s6, 868(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s7, 864(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s8, 860(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s9, 856(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s10, 852(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s11, 848(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    addi sp, s0, -1024
+; ZVFHMIN32-NEXT:    .cfi_def_cfa sp, 1024
+; ZVFHMIN32-NEXT:    lw ra, 1020(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s0, 1016(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s2, 1012(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s3, 1008(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s4, 1004(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s5, 1000(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s6, 996(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s7, 992(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s8, 988(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s9, 984(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s10, 980(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s11, 976(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fld fs0, 968(sp) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fld fs1, 960(sp) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fld fs2, 952(sp) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fld fs3, 944(sp) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fld fs4, 936(sp) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fld fs5, 928(sp) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fld fs6, 920(sp) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fld fs7, 912(sp) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fld fs8, 904(sp) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fld fs9, 896(sp) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fld fs10, 888(sp) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fld fs11, 880(sp) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    .cfi_restore ra
 ; ZVFHMIN32-NEXT:    .cfi_restore s0
 ; ZVFHMIN32-NEXT:    .cfi_restore s2
@@ -2242,26 +2333,50 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    .cfi_restore s9
 ; ZVFHMIN32-NEXT:    .cfi_restore s10
 ; ZVFHMIN32-NEXT:    .cfi_restore s11
-; ZVFHMIN32-NEXT:    addi sp, sp, 896
+; ZVFHMIN32-NEXT:    .cfi_restore fs0
+; ZVFHMIN32-NEXT:    .cfi_restore fs1
+; ZVFHMIN32-NEXT:    .cfi_restore fs2
+; ZVFHMIN32-NEXT:    .cfi_restore fs3
+; ZVFHMIN32-NEXT:    .cfi_restore fs4
+; ZVFHMIN32-NEXT:    .cfi_restore fs5
+; ZVFHMIN32-NEXT:    .cfi_restore fs6
+; ZVFHMIN32-NEXT:    .cfi_restore fs7
+; ZVFHMIN32-NEXT:    .cfi_restore fs8
+; ZVFHMIN32-NEXT:    .cfi_restore fs9
+; ZVFHMIN32-NEXT:    .cfi_restore fs10
+; ZVFHMIN32-NEXT:    .cfi_restore fs11
+; ZVFHMIN32-NEXT:    addi sp, sp, 1024
 ; ZVFHMIN32-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN32-NEXT:    ret
 ;
 ; ZVFHMIN64-LABEL: fcmp_oeq_vv_v128f16:
 ; ZVFHMIN64:       # %bb.0:
-; ZVFHMIN64-NEXT:    addi sp, sp, -896
-; ZVFHMIN64-NEXT:    .cfi_def_cfa_offset 896
-; ZVFHMIN64-NEXT:    sd ra, 888(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s0, 880(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s2, 872(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s3, 864(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s4, 856(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s5, 848(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s6, 840(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s7, 832(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s8, 824(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s9, 816(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s10, 808(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s11, 800(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    addi sp, sp, -1024
+; ZVFHMIN64-NEXT:    .cfi_def_cfa_offset 1024
+; ZVFHMIN64-NEXT:    sd ra, 1016(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s0, 1008(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s2, 1000(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s3, 992(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s4, 984(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s5, 976(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s6, 968(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s7, 960(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s8, 952(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s9, 944(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s10, 936(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s11, 928(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fsd fs0, 920(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fsd fs1, 912(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fsd fs2, 904(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fsd fs3, 896(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fsd fs4, 888(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fsd fs5, 880(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fsd fs6, 872(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fsd fs7, 864(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fsd fs8, 856(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fsd fs9, 848(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fsd fs10, 840(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fsd fs11, 832(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    .cfi_offset ra, -8
 ; ZVFHMIN64-NEXT:    .cfi_offset s0, -16
 ; ZVFHMIN64-NEXT:    .cfi_offset s2, -24
@@ -2274,1096 +2389,1175 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    .cfi_offset s9, -80
 ; ZVFHMIN64-NEXT:    .cfi_offset s10, -88
 ; ZVFHMIN64-NEXT:    .cfi_offset s11, -96
-; ZVFHMIN64-NEXT:    addi s0, sp, 896
+; ZVFHMIN64-NEXT:    .cfi_offset fs0, -104
+; ZVFHMIN64-NEXT:    .cfi_offset fs1, -112
+; ZVFHMIN64-NEXT:    .cfi_offset fs2, -120
+; ZVFHMIN64-NEXT:    .cfi_offset fs3, -128
+; ZVFHMIN64-NEXT:    .cfi_offset fs4, -136
+; ZVFHMIN64-NEXT:    .cfi_offset fs5, -144
+; ZVFHMIN64-NEXT:    .cfi_offset fs6, -152
+; ZVFHMIN64-NEXT:    .cfi_offset fs7, -160
+; ZVFHMIN64-NEXT:    .cfi_offset fs8, -168
+; ZVFHMIN64-NEXT:    .cfi_offset fs9, -176
+; ZVFHMIN64-NEXT:    .cfi_offset fs10, -184
+; ZVFHMIN64-NEXT:    .cfi_offset fs11, -192
+; ZVFHMIN64-NEXT:    addi s0, sp, 1024
 ; ZVFHMIN64-NEXT:    .cfi_def_cfa s0, 0
 ; ZVFHMIN64-NEXT:    csrr a1, vlenb
-; ZVFHMIN64-NEXT:    li a2, 30
+; ZVFHMIN64-NEXT:    li a2, 41
 ; ZVFHMIN64-NEXT:    mul a1, a1, a2
 ; ZVFHMIN64-NEXT:    sub sp, sp, a1
 ; ZVFHMIN64-NEXT:    andi sp, sp, -128
-; ZVFHMIN64-NEXT:    addi a1, a0, 128
-; ZVFHMIN64-NEXT:    li a2, 64
-; ZVFHMIN64-NEXT:    addi a3, sp, 640
-; ZVFHMIN64-NEXT:    addi a4, sp, 384
-; ZVFHMIN64-NEXT:    addi a5, sp, 512
-; ZVFHMIN64-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; ZVFHMIN64-NEXT:    addi a3, a0, 128
+; ZVFHMIN64-NEXT:    li a1, 64
+; ZVFHMIN64-NEXT:    addi a4, sp, 640
+; ZVFHMIN64-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN64-NEXT:    vle16.v v24, (a3)
+; ZVFHMIN64-NEXT:    csrr a3, vlenb
+; ZVFHMIN64-NEXT:    slli a5, a3, 5
+; ZVFHMIN64-NEXT:    add a3, a5, a3
+; ZVFHMIN64-NEXT:    add a3, sp, a3
+; ZVFHMIN64-NEXT:    addi a3, a3, 832
+; ZVFHMIN64-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vle16.v v0, (a0)
-; ZVFHMIN64-NEXT:    addi a0, sp, 256
-; ZVFHMIN64-NEXT:    vle16.v v24, (a1)
-; ZVFHMIN64-NEXT:    vse16.v v8, (a3)
-; ZVFHMIN64-NEXT:    vse16.v v0, (a4)
-; ZVFHMIN64-NEXT:    vse16.v v16, (a5)
-; ZVFHMIN64-NEXT:    vse16.v v24, (a0)
-; ZVFHMIN64-NEXT:    lh a0, 704(sp)
+; ZVFHMIN64-NEXT:    vse16.v v8, (a4)
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 7
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    slli a0, a0, 5
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 6
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    slli a3, a0, 5
+; ZVFHMIN64-NEXT:    sub a0, a3, a0
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 5
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a3, 30
+; ZVFHMIN64-NEXT:    mul a0, a0, a3
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 4
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a3, 29
+; ZVFHMIN64-NEXT:    mul a0, a0, a3
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 3
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a3, 28
+; ZVFHMIN64-NEXT:    mul a0, a0, a3
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 2
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a3, 27
+; ZVFHMIN64-NEXT:    mul a0, a0, a3
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 1
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a3, 26
+; ZVFHMIN64-NEXT:    mul a0, a0, a3
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 15
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a3, 24
+; ZVFHMIN64-NEXT:    mul a0, a0, a3
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 14
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a3, 22
+; ZVFHMIN64-NEXT:    mul a0, a0, a3
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 13
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a3, 20
+; ZVFHMIN64-NEXT:    mul a0, a0, a3
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 12
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a3, 18
+; ZVFHMIN64-NEXT:    mul a0, a0, a3
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 11
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    slli a3, a0, 3
+; ZVFHMIN64-NEXT:    add a0, a3, a0
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 10
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    slli a3, a0, 3
+; ZVFHMIN64-NEXT:    sub a0, a3, a0
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 9
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    slli a3, a0, 2
+; ZVFHMIN64-NEXT:    add a0, a3, a0
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v8, 8
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    slli a3, a0, 1
+; ZVFHMIN64-NEXT:    add a0, a3, a0
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    addi a0, sp, 384
+; ZVFHMIN64-NEXT:    addi a3, sp, 512
+; ZVFHMIN64-NEXT:    vmv.x.s a5, v16
+; ZVFHMIN64-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN64-NEXT:    vse16.v v0, (a0)
+; ZVFHMIN64-NEXT:    vse16.v v16, (a3)
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 7
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a3, 11
+; ZVFHMIN64-NEXT:    mul a0, a0, a3
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 6
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a3, 12
+; ZVFHMIN64-NEXT:    mul a0, a0, a3
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 5
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a3, 13
+; ZVFHMIN64-NEXT:    mul a0, a0, a3
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 4
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a3, 14
+; ZVFHMIN64-NEXT:    mul a0, a0, a3
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 3
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    slli a3, a0, 4
+; ZVFHMIN64-NEXT:    sub a0, a3, a0
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 2
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    slli a0, a0, 4
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 1
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    slli a3, a0, 4
+; ZVFHMIN64-NEXT:    add a0, a3, a0
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 15
+; ZVFHMIN64-NEXT:    addi a0, sp, 832
+; ZVFHMIN64-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v4, v16, 14
+; ZVFHMIN64-NEXT:    vslidedown.vi v2, v16, 13
+; ZVFHMIN64-NEXT:    vslidedown.vi v24, v16, 12
+; ZVFHMIN64-NEXT:    vslidedown.vi v22, v16, 11
+; ZVFHMIN64-NEXT:    vslidedown.vi v20, v16, 10
+; ZVFHMIN64-NEXT:    vslidedown.vi v18, v16, 9
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v16, 8
+; ZVFHMIN64-NEXT:    vmv.x.s a6, v0
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v5, v0, 7
+; ZVFHMIN64-NEXT:    vslidedown.vi v17, v0, 6
+; ZVFHMIN64-NEXT:    vslidedown.vi v23, v0, 5
+; ZVFHMIN64-NEXT:    vslidedown.vi v19, v0, 4
+; ZVFHMIN64-NEXT:    vslidedown.vi v21, v0, 3
+; ZVFHMIN64-NEXT:    vslidedown.vi v3, v0, 2
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v0, 1
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    slli a0, a0, 1
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v0, 15
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 14
+; ZVFHMIN64-NEXT:    vslidedown.vi v12, v0, 13
+; ZVFHMIN64-NEXT:    vslidedown.vi v14, v0, 12
+; ZVFHMIN64-NEXT:    vslidedown.vi v26, v0, 11
+; ZVFHMIN64-NEXT:    vslidedown.vi v28, v0, 10
+; ZVFHMIN64-NEXT:    vslidedown.vi v30, v0, 9
+; ZVFHMIN64-NEXT:    vslidedown.vi v0, v0, 8
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a3, 24
+; ZVFHMIN64-NEXT:    mul a0, a0, a3
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vl2r.v v6, (a0) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v6
+; ZVFHMIN64-NEXT:    csrr a3, vlenb
+; ZVFHMIN64-NEXT:    li a4, 22
+; ZVFHMIN64-NEXT:    mul a3, a3, a4
+; ZVFHMIN64-NEXT:    add a3, sp, a3
+; ZVFHMIN64-NEXT:    addi a3, a3, 832
+; ZVFHMIN64-NEXT:    vl2r.v v6, (a3) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v6
+; ZVFHMIN64-NEXT:    csrr a4, vlenb
+; ZVFHMIN64-NEXT:    li a7, 20
+; ZVFHMIN64-NEXT:    mul a4, a4, a7
+; ZVFHMIN64-NEXT:    add a4, sp, a4
+; ZVFHMIN64-NEXT:    addi a4, a4, 832
+; ZVFHMIN64-NEXT:    vl2r.v v6, (a4) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a7, v6
+; ZVFHMIN64-NEXT:    csrr a4, vlenb
+; ZVFHMIN64-NEXT:    li t0, 18
+; ZVFHMIN64-NEXT:    mul a4, a4, t0
+; ZVFHMIN64-NEXT:    add a4, sp, a4
+; ZVFHMIN64-NEXT:    addi a4, a4, 832
+; ZVFHMIN64-NEXT:    vl2r.v v6, (a4) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s3, v6
+; ZVFHMIN64-NEXT:    csrr a4, vlenb
+; ZVFHMIN64-NEXT:    slli t0, a4, 3
+; ZVFHMIN64-NEXT:    add a4, t0, a4
+; ZVFHMIN64-NEXT:    add a4, sp, a4
+; ZVFHMIN64-NEXT:    addi a4, a4, 832
+; ZVFHMIN64-NEXT:    vl2r.v v6, (a4) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s10, v6
+; ZVFHMIN64-NEXT:    csrr a4, vlenb
+; ZVFHMIN64-NEXT:    slli t0, a4, 3
+; ZVFHMIN64-NEXT:    sub a4, t0, a4
+; ZVFHMIN64-NEXT:    add a4, sp, a4
+; ZVFHMIN64-NEXT:    addi a4, a4, 832
+; ZVFHMIN64-NEXT:    vl2r.v v6, (a4) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s11, v6
+; ZVFHMIN64-NEXT:    csrr a4, vlenb
+; ZVFHMIN64-NEXT:    slli t0, a4, 2
+; ZVFHMIN64-NEXT:    add a4, t0, a4
+; ZVFHMIN64-NEXT:    add a4, sp, a4
+; ZVFHMIN64-NEXT:    addi a4, a4, 832
+; ZVFHMIN64-NEXT:    vl2r.v v6, (a4) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s5, v6
+; ZVFHMIN64-NEXT:    csrr a4, vlenb
+; ZVFHMIN64-NEXT:    slli t0, a4, 1
+; ZVFHMIN64-NEXT:    add a4, t0, a4
+; ZVFHMIN64-NEXT:    add a4, sp, a4
+; ZVFHMIN64-NEXT:    addi a4, a4, 832
+; ZVFHMIN64-NEXT:    vl2r.v v6, (a4) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s7, v6
+; ZVFHMIN64-NEXT:    addi a4, sp, 832
+; ZVFHMIN64-NEXT:    vl2r.v v6, (a4) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s9, v6
+; ZVFHMIN64-NEXT:    vmv.x.s s8, v4
+; ZVFHMIN64-NEXT:    vmv.x.s s6, v2
+; ZVFHMIN64-NEXT:    vmv.x.s s4, v24
+; ZVFHMIN64-NEXT:    vmv.x.s s2, v22
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v20
+; ZVFHMIN64-NEXT:    vmv.x.s t0, v18
+; ZVFHMIN64-NEXT:    sd t0, 112(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s t0, v16
+; ZVFHMIN64-NEXT:    sd t0, 120(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s t6, v8
+; ZVFHMIN64-NEXT:    vmv.x.s t0, v10
+; ZVFHMIN64-NEXT:    vmv.x.s t1, v12
+; ZVFHMIN64-NEXT:    vmv.x.s t2, v14
+; ZVFHMIN64-NEXT:    vmv.x.s t3, v26
+; ZVFHMIN64-NEXT:    vmv.x.s t4, v28
+; ZVFHMIN64-NEXT:    vmv.x.s t5, v30
+; ZVFHMIN64-NEXT:    fmv.h.x fs8, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fs7, a5
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 5
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh a2, 832(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fs6, a2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a5, a2, 5
+; ZVFHMIN64-NEXT:    sub a2, a5, a2
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh a2, 832(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fs5, a2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a5, 30
+; ZVFHMIN64-NEXT:    mul a2, a2, a5
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh a2, 832(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x ft10, a2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a5, 29
+; ZVFHMIN64-NEXT:    mul a2, a2, a5
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh a2, 832(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x ft8, a2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a5, 28
+; ZVFHMIN64-NEXT:    mul a2, a2, a5
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh a2, 832(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x ft2, a2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a5, 27
+; ZVFHMIN64-NEXT:    mul a2, a2, a5
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh a2, 832(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x ft3, a2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a5, 26
+; ZVFHMIN64-NEXT:    mul a2, a2, a5
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh a2, 832(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x ft4, a2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a5, 11
+; ZVFHMIN64-NEXT:    mul a2, a2, a5
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh a2, 832(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x ft5, a2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a5, 12
+; ZVFHMIN64-NEXT:    mul a2, a2, a5
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh a2, 832(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x ft6, a2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a5, 13
+; ZVFHMIN64-NEXT:    mul a2, a2, a5
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh a2, 832(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa6, a2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a5, 14
+; ZVFHMIN64-NEXT:    mul a2, a2, a5
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh a2, 832(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fs0, a2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a5, a2, 4
+; ZVFHMIN64-NEXT:    sub a2, a5, a2
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh a2, 832(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fs1, a2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 4
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh a2, 832(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fs2, a2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a5, a2, 4
+; ZVFHMIN64-NEXT:    add a2, a5, a2
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh a2, 832(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fs3, a2
+; ZVFHMIN64-NEXT:    addi a2, sp, 256
+; ZVFHMIN64-NEXT:    fmv.h.x fs4, a0
+; ZVFHMIN64-NEXT:    fmv.h.x ft7, a3
+; ZVFHMIN64-NEXT:    fmv.h.x ft11, a7
+; ZVFHMIN64-NEXT:    fmv.h.x ft9, s3
+; ZVFHMIN64-NEXT:    fmv.h.x fa7, s10
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s11
+; ZVFHMIN64-NEXT:    fsh fa5, 102(sp) # 2-byte Folded Spill
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    slli a3, a0, 5
+; ZVFHMIN64-NEXT:    add a0, a3, a0
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    addi a0, a0, 832
+; ZVFHMIN64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN64-NEXT:    vse16.v v24, (a2)
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v0
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 15
+; ZVFHMIN64-NEXT:    vmv.x.s a5, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 14
+; ZVFHMIN64-NEXT:    vmv.x.s ra, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 13
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 12
+; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 11
+; ZVFHMIN64-NEXT:    vmv.x.s s3, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 10
+; ZVFHMIN64-NEXT:    vmv.x.s a7, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 9
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v8
+; ZVFHMIN64-NEXT:    sd a0, 104(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, s5
+; ZVFHMIN64-NEXT:    vmv.x.s s5, v5
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, s7
+; ZVFHMIN64-NEXT:    vmv.x.s s7, v17
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, s9
+; ZVFHMIN64-NEXT:    vmv.x.s s9, v23
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, s8
+; ZVFHMIN64-NEXT:    vmv.x.s s8, v19
+; ZVFHMIN64-NEXT:    fmv.h.x ft0, s6
+; ZVFHMIN64-NEXT:    vmv.x.s s6, v21
+; ZVFHMIN64-NEXT:    fmv.h.x ft1, s4
+; ZVFHMIN64-NEXT:    vmv.x.s s10, v3
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s2
+; ZVFHMIN64-NEXT:    fsh fa5, 100(sp) # 2-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s s2, v24
+; ZVFHMIN64-NEXT:    fmv.h.x fs9, a6
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    slli a0, a0, 1
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    lh a6, 832(a0) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 7
+; ZVFHMIN64-NEXT:    fmv.h.x fs10, s2
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 6
+; ZVFHMIN64-NEXT:    fmv.h.x fs11, s5
+; ZVFHMIN64-NEXT:    feq.h s2, fs8, fs9
+; ZVFHMIN64-NEXT:    fmv.h.x fs8, s7
+; ZVFHMIN64-NEXT:    vmv.x.s s7, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 5
+; ZVFHMIN64-NEXT:    fmv.h.x fs9, s9
+; ZVFHMIN64-NEXT:    feq.h s11, fs7, fs10
+; ZVFHMIN64-NEXT:    fmv.h.x fs7, s8
+; ZVFHMIN64-NEXT:    vmv.x.s s8, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 4
+; ZVFHMIN64-NEXT:    fmv.h.x fs10, s6
+; ZVFHMIN64-NEXT:    feq.h s4, fs6, fs11
+; ZVFHMIN64-NEXT:    fmv.h.x fs6, s10
+; ZVFHMIN64-NEXT:    vmv.x.s s9, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 3
+; ZVFHMIN64-NEXT:    fmv.h.x fs11, a6
+; ZVFHMIN64-NEXT:    feq.h s5, fs5, fs8
+; ZVFHMIN64-NEXT:    fmv.h.x fs5, a0
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 2
+; ZVFHMIN64-NEXT:    fmv.h.x fs8, s7
+; ZVFHMIN64-NEXT:    feq.h s6, ft10, fs9
+; ZVFHMIN64-NEXT:    fmv.h.x fs9, s8
+; ZVFHMIN64-NEXT:    vmv.x.s a6, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 1
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN64-NEXT:    feq.h s7, ft8, fs7
+; ZVFHMIN64-NEXT:    fmv.h.x fs7, a0
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a6
+; ZVFHMIN64-NEXT:    feq.h s8, ft2, fs10
+; ZVFHMIN64-NEXT:    fmv.h.x fs10, a0
+; ZVFHMIN64-NEXT:    feq.h s9, ft3, fs6
+; ZVFHMIN64-NEXT:    fmv.h.x fs6, t6
+; ZVFHMIN64-NEXT:    feq.h s10, ft4, fs11
+; ZVFHMIN64-NEXT:    fmv.h.x fs11, t0
+; ZVFHMIN64-NEXT:    feq.h t0, ft5, fs5
+; ZVFHMIN64-NEXT:    fmv.h.x fs5, t1
+; ZVFHMIN64-NEXT:    feq.h t1, ft6, fs8
+; ZVFHMIN64-NEXT:    fmv.h.x ft10, t2
+; ZVFHMIN64-NEXT:    feq.h t2, fa6, fs9
+; ZVFHMIN64-NEXT:    fmv.h.x ft8, t3
+; ZVFHMIN64-NEXT:    feq.h t3, fs0, fa5
+; ZVFHMIN64-NEXT:    fmv.h.x ft2, t4
+; ZVFHMIN64-NEXT:    feq.h t4, fs1, fs7
+; ZVFHMIN64-NEXT:    fmv.h.x ft3, t5
+; ZVFHMIN64-NEXT:    feq.h t5, fs2, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x ft4, a3
+; ZVFHMIN64-NEXT:    feq.h t6, fs3, fs10
+; ZVFHMIN64-NEXT:    fmv.h.x ft5, a5
+; ZVFHMIN64-NEXT:    feq.h a0, fs4, fs6
+; ZVFHMIN64-NEXT:    fmv.h.x ft6, ra
+; ZVFHMIN64-NEXT:    feq.h a5, ft7, fs11
+; ZVFHMIN64-NEXT:    fmv.h.x ft7, a2
+; ZVFHMIN64-NEXT:    lh a2, 704(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa6, a1
+; ZVFHMIN64-NEXT:    feq.h a6, ft11, fs5
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
 ; ZVFHMIN64-NEXT:    lh a1, 448(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 160(sp)
-; ZVFHMIN64-NEXT:    lh a0, 702(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 160(sp)
+; ZVFHMIN64-NEXT:    lh a1, 702(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 446(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 159(sp)
-; ZVFHMIN64-NEXT:    lh a0, 700(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 159(sp)
+; ZVFHMIN64-NEXT:    lh a1, 700(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 444(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 158(sp)
-; ZVFHMIN64-NEXT:    lh a0, 698(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 158(sp)
+; ZVFHMIN64-NEXT:    lh a1, 698(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 442(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 157(sp)
-; ZVFHMIN64-NEXT:    lh a0, 696(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 157(sp)
+; ZVFHMIN64-NEXT:    lh a1, 696(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 440(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 156(sp)
-; ZVFHMIN64-NEXT:    lh a0, 694(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 156(sp)
+; ZVFHMIN64-NEXT:    lh a1, 694(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 438(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 155(sp)
-; ZVFHMIN64-NEXT:    lh a0, 692(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 155(sp)
+; ZVFHMIN64-NEXT:    lh a1, 692(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 436(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 154(sp)
-; ZVFHMIN64-NEXT:    lh a0, 690(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 154(sp)
+; ZVFHMIN64-NEXT:    lh a1, 690(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 434(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 153(sp)
-; ZVFHMIN64-NEXT:    lh a0, 688(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 153(sp)
+; ZVFHMIN64-NEXT:    lh a1, 688(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 432(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 152(sp)
-; ZVFHMIN64-NEXT:    lh a0, 686(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 152(sp)
+; ZVFHMIN64-NEXT:    lh a1, 686(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 430(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 151(sp)
-; ZVFHMIN64-NEXT:    lh a0, 684(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 151(sp)
+; ZVFHMIN64-NEXT:    lh a1, 684(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 428(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 150(sp)
-; ZVFHMIN64-NEXT:    lh a0, 682(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 150(sp)
+; ZVFHMIN64-NEXT:    lh a1, 682(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 426(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 149(sp)
-; ZVFHMIN64-NEXT:    lh a0, 680(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 149(sp)
+; ZVFHMIN64-NEXT:    lh a1, 680(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 424(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 148(sp)
-; ZVFHMIN64-NEXT:    lh a0, 678(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 148(sp)
+; ZVFHMIN64-NEXT:    lh a1, 678(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 422(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 147(sp)
-; ZVFHMIN64-NEXT:    lh a0, 676(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 147(sp)
+; ZVFHMIN64-NEXT:    lh a1, 676(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 420(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 146(sp)
-; ZVFHMIN64-NEXT:    lh a0, 674(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 146(sp)
+; ZVFHMIN64-NEXT:    lh a1, 674(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 418(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
-; ZVFHMIN64-NEXT:    sb a0, 145(sp)
-; ZVFHMIN64-NEXT:    lh a0, 672(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 145(sp)
+; ZVFHMIN64-NEXT:    lh a1, 672(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 416(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a2, 128(sp)
-; ZVFHMIN64-NEXT:    sb a0, 144(sp)
-; ZVFHMIN64-NEXT:    lh a0, 576(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb s2, 128(sp)
+; ZVFHMIN64-NEXT:    feq.h s2, ft9, ft10
+; ZVFHMIN64-NEXT:    sb a1, 144(sp)
+; ZVFHMIN64-NEXT:    lh a1, 576(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 320(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 224(sp)
-; ZVFHMIN64-NEXT:    lh a0, 574(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 224(sp)
+; ZVFHMIN64-NEXT:    lh a1, 574(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 318(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 223(sp)
-; ZVFHMIN64-NEXT:    lh a0, 572(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 223(sp)
+; ZVFHMIN64-NEXT:    lh a1, 572(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 316(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 222(sp)
-; ZVFHMIN64-NEXT:    lh a0, 570(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 222(sp)
+; ZVFHMIN64-NEXT:    lh a1, 570(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 314(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 221(sp)
-; ZVFHMIN64-NEXT:    lh a0, 568(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 221(sp)
+; ZVFHMIN64-NEXT:    lh a1, 568(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 312(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 220(sp)
-; ZVFHMIN64-NEXT:    lh a0, 566(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 220(sp)
+; ZVFHMIN64-NEXT:    lh a1, 566(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 310(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 219(sp)
-; ZVFHMIN64-NEXT:    lh a0, 564(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 219(sp)
+; ZVFHMIN64-NEXT:    lh a1, 564(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 308(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 218(sp)
-; ZVFHMIN64-NEXT:    lh a0, 562(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 218(sp)
+; ZVFHMIN64-NEXT:    lh a1, 562(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 306(sp)
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 7
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 29
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 6
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 28
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 5
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 27
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 4
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 26
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 3
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 25
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 2
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 24
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 1
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 23
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v26, v8, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 14
-; ZVFHMIN64-NEXT:    vslidedown.vi v28, v8, 13
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 12
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 1
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v4, v8, 11
-; ZVFHMIN64-NEXT:    vslidedown.vi v2, v8, 10
-; ZVFHMIN64-NEXT:    vslidedown.vi v30, v8, 9
-; ZVFHMIN64-NEXT:    vslidedown.vi v22, v8, 8
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v16
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 217(sp)
-; ZVFHMIN64-NEXT:    lh a0, 560(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 217(sp)
+; ZVFHMIN64-NEXT:    lh a1, 560(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 304(sp)
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v3, v16, 7
-; ZVFHMIN64-NEXT:    vslidedown.vi v31, v16, 6
-; ZVFHMIN64-NEXT:    vslidedown.vi v5, v16, 5
-; ZVFHMIN64-NEXT:    vslidedown.vi v23, v16, 4
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 3
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 21
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 2
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 20
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 1
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 22
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v18, v16, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v14, v16, 14
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 13
-; ZVFHMIN64-NEXT:    vslidedown.vi v12, v16, 12
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 11
-; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 10
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 18
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 9
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 14
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 216(sp)
-; ZVFHMIN64-NEXT:    lh a0, 558(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 216(sp)
+; ZVFHMIN64-NEXT:    lh a1, 558(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 302(sp)
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v13, v0, 7
-; ZVFHMIN64-NEXT:    vslidedown.vi v29, v0, 6
-; ZVFHMIN64-NEXT:    vslidedown.vi v11, v0, 5
-; ZVFHMIN64-NEXT:    vslidedown.vi v7, v0, 4
-; ZVFHMIN64-NEXT:    vslidedown.vi v9, v0, 3
-; ZVFHMIN64-NEXT:    vslidedown.vi v21, v0, 2
-; ZVFHMIN64-NEXT:    vslidedown.vi v27, v0, 1
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 15
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 2
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 14
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 13
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 6
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 12
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 12
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 11
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 10
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 10
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 4
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 9
-; ZVFHMIN64-NEXT:    vslidedown.vi v0, v0, 8
-; ZVFHMIN64-NEXT:    addi a2, sp, 800
-; ZVFHMIN64-NEXT:    vs2r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vmv.x.s t4, v26
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 215(sp)
-; ZVFHMIN64-NEXT:    lh a0, 556(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 215(sp)
+; ZVFHMIN64-NEXT:    lh a1, 556(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 300(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t3, v20
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v28
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 214(sp)
-; ZVFHMIN64-NEXT:    lh a0, 554(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 214(sp)
+; ZVFHMIN64-NEXT:    lh a1, 554(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 298(sp)
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 1
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vl2r.v v0, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s t2, v0
-; ZVFHMIN64-NEXT:    vmv.x.s t0, v4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 213(sp)
-; ZVFHMIN64-NEXT:    lh a0, 552(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 213(sp)
+; ZVFHMIN64-NEXT:    lh a1, 552(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 296(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a7, v2
-; ZVFHMIN64-NEXT:    vmv.x.s a6, v30
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 212(sp)
-; ZVFHMIN64-NEXT:    lh a0, 550(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 212(sp)
+; ZVFHMIN64-NEXT:    lh a1, 550(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 294(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v22
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v18
-; ZVFHMIN64-NEXT:    sd a2, 96(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 211(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 211(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 548(sp)
-; ZVFHMIN64-NEXT:    lh t5, 292(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v14
-; ZVFHMIN64-NEXT:    sd a0, 104(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v8
-; ZVFHMIN64-NEXT:    sd a0, 120(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t5
+; ZVFHMIN64-NEXT:    lh a1, 292(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a1, 210(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 546(sp)
-; ZVFHMIN64-NEXT:    lh t5, 290(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v24
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 290(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, t5
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa3
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a1, 209(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 544(sp)
-; ZVFHMIN64-NEXT:    lh t5, 288(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t5
+; ZVFHMIN64-NEXT:    lh a1, 288(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 192(sp)
+; ZVFHMIN64-NEXT:    sb s11, 192(sp)
+; ZVFHMIN64-NEXT:    feq.h s11, fa7, ft8
 ; ZVFHMIN64-NEXT:    sb a1, 208(sp)
-; ZVFHMIN64-NEXT:    lh t5, 738(sp)
-; ZVFHMIN64-NEXT:    lh t6, 482(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v12
-; ZVFHMIN64-NEXT:    sd a0, 88(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v10
-; ZVFHMIN64-NEXT:    sd a0, 112(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t5, 177(sp)
-; ZVFHMIN64-NEXT:    lh t5, 736(sp)
-; ZVFHMIN64-NEXT:    lh t6, 480(sp)
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 29
-; ZVFHMIN64-NEXT:    mul a0, a0, a1
-; ZVFHMIN64-NEXT:    add a0, sp, a0
-; ZVFHMIN64-NEXT:    lh s5, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 28
-; ZVFHMIN64-NEXT:    mul a0, a0, a1
-; ZVFHMIN64-NEXT:    add a0, sp, a0
-; ZVFHMIN64-NEXT:    lh s6, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t5, 176(sp)
-; ZVFHMIN64-NEXT:    lh t5, 734(sp)
-; ZVFHMIN64-NEXT:    lh t6, 478(sp)
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 27
-; ZVFHMIN64-NEXT:    mul a0, a0, a1
-; ZVFHMIN64-NEXT:    add a0, sp, a0
-; ZVFHMIN64-NEXT:    lh s7, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 26
-; ZVFHMIN64-NEXT:    mul a0, a0, a1
-; ZVFHMIN64-NEXT:    add a0, sp, a0
-; ZVFHMIN64-NEXT:    lh s8, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t5, 175(sp)
-; ZVFHMIN64-NEXT:    lh t5, 732(sp)
-; ZVFHMIN64-NEXT:    lh t6, 476(sp)
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 25
-; ZVFHMIN64-NEXT:    mul a0, a0, a1
-; ZVFHMIN64-NEXT:    add a0, sp, a0
-; ZVFHMIN64-NEXT:    lh s4, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 24
-; ZVFHMIN64-NEXT:    mul a0, a0, a1
-; ZVFHMIN64-NEXT:    add a0, sp, a0
-; ZVFHMIN64-NEXT:    lh s3, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t5, 174(sp)
-; ZVFHMIN64-NEXT:    lh t6, 730(sp)
-; ZVFHMIN64-NEXT:    lh s9, 474(sp)
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 23
-; ZVFHMIN64-NEXT:    mul a0, a0, a1
-; ZVFHMIN64-NEXT:    add a0, sp, a0
-; ZVFHMIN64-NEXT:    lh s2, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v3
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s9
-; ZVFHMIN64-NEXT:    feq.h t6, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t6, 173(sp)
-; ZVFHMIN64-NEXT:    lh s9, 728(sp)
-; ZVFHMIN64-NEXT:    lh s10, 472(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t6, v31
-; ZVFHMIN64-NEXT:    vmv.x.s ra, v13
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
-; ZVFHMIN64-NEXT:    feq.h s9, fa5, fa4
-; ZVFHMIN64-NEXT:    sb s9, 172(sp)
-; ZVFHMIN64-NEXT:    lh s9, 726(sp)
-; ZVFHMIN64-NEXT:    lh s10, 470(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v29
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v11
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
-; ZVFHMIN64-NEXT:    feq.h s9, fa5, fa4
-; ZVFHMIN64-NEXT:    sb s9, 171(sp)
-; ZVFHMIN64-NEXT:    lh s10, 724(sp)
-; ZVFHMIN64-NEXT:    lh s11, 468(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v7
-; ZVFHMIN64-NEXT:    vmv.x.s s9, v9
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s10
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s11
-; ZVFHMIN64-NEXT:    feq.h s10, fa5, fa4
-; ZVFHMIN64-NEXT:    sb s10, 170(sp)
-; ZVFHMIN64-NEXT:    lh a0, 722(sp)
+; ZVFHMIN64-NEXT:    lh a1, 738(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 482(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 177(sp)
+; ZVFHMIN64-NEXT:    lh a1, 736(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 480(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 176(sp)
+; ZVFHMIN64-NEXT:    lh a1, 734(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 478(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 175(sp)
+; ZVFHMIN64-NEXT:    lh a1, 732(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 476(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 174(sp)
+; ZVFHMIN64-NEXT:    lh a1, 730(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 474(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 173(sp)
+; ZVFHMIN64-NEXT:    lh a1, 728(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 472(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 172(sp)
+; ZVFHMIN64-NEXT:    lh a1, 726(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 470(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 171(sp)
+; ZVFHMIN64-NEXT:    lh a1, 724(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 468(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 170(sp)
+; ZVFHMIN64-NEXT:    lh a1, 722(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 466(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s10, v21
-; ZVFHMIN64-NEXT:    vmv.x.s s11, v27
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 169(sp)
-; ZVFHMIN64-NEXT:    lh a0, 720(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 169(sp)
+; ZVFHMIN64-NEXT:    lh a1, 720(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 464(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s5
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s6
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN64-NEXT:    sb a0, 168(sp)
-; ZVFHMIN64-NEXT:    lh a0, 718(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 168(sp)
+; ZVFHMIN64-NEXT:    lh a1, 718(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 462(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, s7
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, s8
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa0, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa1, fa0
-; ZVFHMIN64-NEXT:    fmv.h.x fa1, ra
-; ZVFHMIN64-NEXT:    sb a0, 167(sp)
-; ZVFHMIN64-NEXT:    lh a0, 716(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa0, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 167(sp)
+; ZVFHMIN64-NEXT:    lh a1, 716(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 460(sp)
-; ZVFHMIN64-NEXT:    feq.h s5, fa5, fa1
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s4
 ; ZVFHMIN64-NEXT:    sb a1, 166(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 714(sp)
-; ZVFHMIN64-NEXT:    lh a2, 458(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a3, fa3, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 458(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa3
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s3
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a1, 165(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 712(sp)
-; ZVFHMIN64-NEXT:    lh a2, 456(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a4
-; ZVFHMIN64-NEXT:    feq.h a4, fa2, fa3
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa3, fa2
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, s2
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 456(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a1, 164(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 710(sp)
-; ZVFHMIN64-NEXT:    lh a2, 454(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, s9
-; ZVFHMIN64-NEXT:    feq.h s2, fa5, fa2
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa2
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s10
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, s11
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 454(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a1, 163(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 708(sp)
-; ZVFHMIN64-NEXT:    lh a2, 452(sp)
-; ZVFHMIN64-NEXT:    feq.h s3, fa4, fa5
-; ZVFHMIN64-NEXT:    feq.h s4, fa3, fa2
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    lh a1, 452(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s3
 ; ZVFHMIN64-NEXT:    sb a1, 162(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 706(sp)
 ; ZVFHMIN64-NEXT:    lh a2, 450(sp)
-; ZVFHMIN64-NEXT:    sb s4, 129(sp)
-; ZVFHMIN64-NEXT:    sb s3, 130(sp)
-; ZVFHMIN64-NEXT:    sb s2, 131(sp)
-; ZVFHMIN64-NEXT:    sb a4, 132(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a3, 133(sp)
-; ZVFHMIN64-NEXT:    sb a0, 134(sp)
-; ZVFHMIN64-NEXT:    sb s5, 135(sp)
+; ZVFHMIN64-NEXT:    sb s10, 129(sp)
+; ZVFHMIN64-NEXT:    flh fa4, 102(sp) # 2-byte Folded Reload
+; ZVFHMIN64-NEXT:    feq.h s10, fa4, ft2
+; ZVFHMIN64-NEXT:    sb s9, 130(sp)
+; ZVFHMIN64-NEXT:    feq.h s9, fa3, ft3
+; ZVFHMIN64-NEXT:    sb s8, 131(sp)
+; ZVFHMIN64-NEXT:    feq.h ra, fa2, ft4
+; ZVFHMIN64-NEXT:    sb s7, 132(sp)
+; ZVFHMIN64-NEXT:    feq.h s3, fa1, ft5
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h s7, fa0, ft6
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a2
+; ZVFHMIN64-NEXT:    feq.h s8, ft0, ft7
+; ZVFHMIN64-NEXT:    sb s6, 133(sp)
+; ZVFHMIN64-NEXT:    feq.h s6, ft1, fa6
+; ZVFHMIN64-NEXT:    sb s5, 134(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa3
+; ZVFHMIN64-NEXT:    sb s4, 135(sp)
+; ZVFHMIN64-NEXT:    flh fa4, 100(sp) # 2-byte Folded Reload
+; ZVFHMIN64-NEXT:    feq.h s4, fa4, fa5
 ; ZVFHMIN64-NEXT:    sb a1, 161(sp)
-; ZVFHMIN64-NEXT:    lh a0, 610(sp)
+; ZVFHMIN64-NEXT:    lh a1, 610(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 354(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s6, v5
-; ZVFHMIN64-NEXT:    vmv.x.s s5, v23
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 241(sp)
-; ZVFHMIN64-NEXT:    lh a0, 608(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 241(sp)
+; ZVFHMIN64-NEXT:    lh a1, 608(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 352(sp)
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 21
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s4, 800(a2) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 20
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s3, 800(a2) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 240(sp)
-; ZVFHMIN64-NEXT:    lh a0, 606(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 240(sp)
+; ZVFHMIN64-NEXT:    lh a1, 606(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 350(sp)
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 22
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s2, 800(a2) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
-; ZVFHMIN64-NEXT:    sb a0, 239(sp)
-; ZVFHMIN64-NEXT:    lh a0, 604(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 239(sp)
+; ZVFHMIN64-NEXT:    lh a1, 604(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 348(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 7
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN64-NEXT:    sb a0, 238(sp)
-; ZVFHMIN64-NEXT:    lh a0, 602(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 238(sp)
+; ZVFHMIN64-NEXT:    lh a1, 602(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 346(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 6
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN64-NEXT:    sb a0, 237(sp)
-; ZVFHMIN64-NEXT:    lh a0, 600(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 237(sp)
+; ZVFHMIN64-NEXT:    lh a1, 600(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 344(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 5
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN64-NEXT:    sb a0, 236(sp)
-; ZVFHMIN64-NEXT:    lh a0, 598(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 236(sp)
+; ZVFHMIN64-NEXT:    lh a1, 598(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 342(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 4
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN64-NEXT:    sb a0, 235(sp)
-; ZVFHMIN64-NEXT:    lh a0, 596(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 235(sp)
+; ZVFHMIN64-NEXT:    lh a1, 596(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 340(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s8, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 3
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN64-NEXT:    sb a0, 234(sp)
-; ZVFHMIN64-NEXT:    lh a0, 594(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 234(sp)
+; ZVFHMIN64-NEXT:    lh a1, 594(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 338(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s9, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 2
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN64-NEXT:    sb a0, 233(sp)
-; ZVFHMIN64-NEXT:    lh a0, 592(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN64-NEXT:    lh t5, 336(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 1
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
-; ZVFHMIN64-NEXT:    vmv.x.s s7, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, t5
-; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
-; ZVFHMIN64-NEXT:    fmv.h.x fa3, a2
-; ZVFHMIN64-NEXT:    sb a0, 232(sp)
-; ZVFHMIN64-NEXT:    lh a0, 590(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa2, a3
-; ZVFHMIN64-NEXT:    lh a2, 334(sp)
-; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa3
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    feq.h t6, fa4, fa2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s6
-; ZVFHMIN64-NEXT:    sb a0, 231(sp)
-; ZVFHMIN64-NEXT:    lh a0, 588(sp)
-; ZVFHMIN64-NEXT:    lh a2, 332(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s5
-; ZVFHMIN64-NEXT:    sb a0, 230(sp)
-; ZVFHMIN64-NEXT:    lh a0, 586(sp)
-; ZVFHMIN64-NEXT:    lh a2, 330(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s8
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s4
-; ZVFHMIN64-NEXT:    sb a0, 229(sp)
-; ZVFHMIN64-NEXT:    lh a0, 584(sp)
-; ZVFHMIN64-NEXT:    lh a2, 328(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s9
-; ZVFHMIN64-NEXT:    feq.h s4, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s3
-; ZVFHMIN64-NEXT:    sb a0, 228(sp)
-; ZVFHMIN64-NEXT:    lh a0, 582(sp)
-; ZVFHMIN64-NEXT:    lh a2, 326(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s2
-; ZVFHMIN64-NEXT:    sb a0, 227(sp)
-; ZVFHMIN64-NEXT:    lh a0, 580(sp)
-; ZVFHMIN64-NEXT:    lh a2, 324(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s7
-; ZVFHMIN64-NEXT:    feq.h s2, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 226(sp)
-; ZVFHMIN64-NEXT:    lh a0, 578(sp)
+; ZVFHMIN64-NEXT:    sb a1, 233(sp)
+; ZVFHMIN64-NEXT:    lh a1, 592(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 336(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 232(sp)
+; ZVFHMIN64-NEXT:    lh a1, 590(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 334(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 231(sp)
+; ZVFHMIN64-NEXT:    lh a1, 588(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 332(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 230(sp)
+; ZVFHMIN64-NEXT:    lh a1, 586(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 330(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 229(sp)
+; ZVFHMIN64-NEXT:    lh a1, 584(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 328(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 228(sp)
+; ZVFHMIN64-NEXT:    lh a1, 582(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 326(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 227(sp)
+; ZVFHMIN64-NEXT:    lh a1, 580(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 324(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN64-NEXT:    sb a1, 226(sp)
+; ZVFHMIN64-NEXT:    lh a1, 578(sp)
 ; ZVFHMIN64-NEXT:    lh a2, 322(sp)
-; ZVFHMIN64-NEXT:    sb s2, 193(sp)
-; ZVFHMIN64-NEXT:    sb a1, 194(sp)
-; ZVFHMIN64-NEXT:    sb s4, 195(sp)
-; ZVFHMIN64-NEXT:    sb a4, 196(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    sb t6, 193(sp)
+; ZVFHMIN64-NEXT:    sb t5, 194(sp)
+; ZVFHMIN64-NEXT:    sb t4, 195(sp)
+; ZVFHMIN64-NEXT:    sb t3, 196(sp)
+; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a3, 197(sp)
-; ZVFHMIN64-NEXT:    sb t6, 198(sp)
-; ZVFHMIN64-NEXT:    sb t5, 199(sp)
-; ZVFHMIN64-NEXT:    sb a0, 225(sp)
-; ZVFHMIN64-NEXT:    lh a0, 766(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb t2, 197(sp)
+; ZVFHMIN64-NEXT:    sb t1, 198(sp)
+; ZVFHMIN64-NEXT:    sb t0, 199(sp)
+; ZVFHMIN64-NEXT:    sb a1, 225(sp)
+; ZVFHMIN64-NEXT:    lh a1, 766(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 510(sp)
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 18
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s s2, v8
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 14
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s t6, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 191(sp)
-; ZVFHMIN64-NEXT:    lh a0, 764(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 191(sp)
+; ZVFHMIN64-NEXT:    lh a1, 764(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 508(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v6
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 2
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 190(sp)
-; ZVFHMIN64-NEXT:    lh a0, 762(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 190(sp)
+; ZVFHMIN64-NEXT:    lh a1, 762(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 506(sp)
-; ZVFHMIN64-NEXT:    csrr a3, vlenb
-; ZVFHMIN64-NEXT:    slli a3, a3, 3
-; ZVFHMIN64-NEXT:    add a3, sp, a3
-; ZVFHMIN64-NEXT:    addi a3, a3, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    csrr a4, vlenb
-; ZVFHMIN64-NEXT:    li s3, 6
-; ZVFHMIN64-NEXT:    mul a4, a4, s3
-; ZVFHMIN64-NEXT:    add a4, sp, a4
-; ZVFHMIN64-NEXT:    addi a4, a4, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 189(sp)
-; ZVFHMIN64-NEXT:    lh a0, 760(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 189(sp)
+; ZVFHMIN64-NEXT:    lh a1, 760(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 504(sp)
-; ZVFHMIN64-NEXT:    csrr s3, vlenb
-; ZVFHMIN64-NEXT:    li s4, 12
-; ZVFHMIN64-NEXT:    mul s3, s3, s4
-; ZVFHMIN64-NEXT:    add s3, sp, s3
-; ZVFHMIN64-NEXT:    addi s3, s3, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s s6, v8
-; ZVFHMIN64-NEXT:    csrr s3, vlenb
-; ZVFHMIN64-NEXT:    li s4, 10
-; ZVFHMIN64-NEXT:    mul s3, s3, s4
-; ZVFHMIN64-NEXT:    add s3, sp, s3
-; ZVFHMIN64-NEXT:    addi s3, s3, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s s4, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 188(sp)
-; ZVFHMIN64-NEXT:    lh a0, 758(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 188(sp)
+; ZVFHMIN64-NEXT:    lh a1, 758(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 502(sp)
-; ZVFHMIN64-NEXT:    csrr s3, vlenb
-; ZVFHMIN64-NEXT:    slli s3, s3, 4
-; ZVFHMIN64-NEXT:    add s3, sp, s3
-; ZVFHMIN64-NEXT:    addi s3, s3, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s s5, v8
-; ZVFHMIN64-NEXT:    vmv.x.s s3, v16
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN64-NEXT:    sb a0, 187(sp)
-; ZVFHMIN64-NEXT:    lh a0, 756(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 187(sp)
+; ZVFHMIN64-NEXT:    lh a1, 756(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 500(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h t4, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t3
-; ZVFHMIN64-NEXT:    sb a0, 186(sp)
-; ZVFHMIN64-NEXT:    lh a0, 754(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 186(sp)
+; ZVFHMIN64-NEXT:    lh a1, 754(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 498(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h t3, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t1
-; ZVFHMIN64-NEXT:    sb a0, 185(sp)
-; ZVFHMIN64-NEXT:    lh a0, 752(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 185(sp)
+; ZVFHMIN64-NEXT:    lh a1, 752(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 496(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h t1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    sb a0, 184(sp)
-; ZVFHMIN64-NEXT:    lh a0, 750(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 184(sp)
+; ZVFHMIN64-NEXT:    lh a1, 750(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 494(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s6
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT:    sb a0, 183(sp)
-; ZVFHMIN64-NEXT:    lh a0, 748(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 183(sp)
+; ZVFHMIN64-NEXT:    lh a1, 748(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 492(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s4
-; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a7
-; ZVFHMIN64-NEXT:    sb a0, 182(sp)
-; ZVFHMIN64-NEXT:    lh a0, 746(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 182(sp)
+; ZVFHMIN64-NEXT:    lh a1, 746(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 490(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN64-NEXT:    sb a0, 181(sp)
-; ZVFHMIN64-NEXT:    lh a0, 744(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 181(sp)
+; ZVFHMIN64-NEXT:    lh a1, 744(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    lh a1, 488(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s3
-; ZVFHMIN64-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a5
-; ZVFHMIN64-NEXT:    addi a1, sp, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 15
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v8
-; ZVFHMIN64-NEXT:    sb a0, 180(sp)
-; ZVFHMIN64-NEXT:    lh a0, 742(sp)
-; ZVFHMIN64-NEXT:    lh a7, 486(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 180(sp)
+; ZVFHMIN64-NEXT:    lh a1, 742(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    lh a1, 486(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 179(sp)
-; ZVFHMIN64-NEXT:    lh a0, 740(sp)
-; ZVFHMIN64-NEXT:    lh a7, 484(sp)
-; ZVFHMIN64-NEXT:    sb a2, 140(sp)
-; ZVFHMIN64-NEXT:    sb t1, 141(sp)
-; ZVFHMIN64-NEXT:    sb t3, 142(sp)
-; ZVFHMIN64-NEXT:    sb t4, 143(sp)
-; ZVFHMIN64-NEXT:    sb a1, 136(sp)
-; ZVFHMIN64-NEXT:    sb a6, 137(sp)
-; ZVFHMIN64-NEXT:    sb a4, 138(sp)
-; ZVFHMIN64-NEXT:    sb a3, 139(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN64-NEXT:    ld a2, 112(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    ld a2, 104(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    sb a1, 179(sp)
+; ZVFHMIN64-NEXT:    lh a2, 740(sp)
+; ZVFHMIN64-NEXT:    lh a3, 484(sp)
+; ZVFHMIN64-NEXT:    sb s2, 140(sp)
+; ZVFHMIN64-NEXT:    sb a6, 141(sp)
+; ZVFHMIN64-NEXT:    sb a5, 142(sp)
+; ZVFHMIN64-NEXT:    sb a0, 143(sp)
+; ZVFHMIN64-NEXT:    sb ra, 136(sp)
+; ZVFHMIN64-NEXT:    sb s9, 137(sp)
+; ZVFHMIN64-NEXT:    sb s10, 138(sp)
+; ZVFHMIN64-NEXT:    sb s11, 139(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 178(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 638(sp)
-; ZVFHMIN64-NEXT:    lh a1, 382(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 14
-; ZVFHMIN64-NEXT:    vmv.x.s t3, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    lh a0, 382(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 255(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 636(sp)
-; ZVFHMIN64-NEXT:    lh a1, 380(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 13
-; ZVFHMIN64-NEXT:    vmv.x.s t2, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    lh a0, 380(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 254(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 634(sp)
-; ZVFHMIN64-NEXT:    lh a1, 378(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 12
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    lh a0, 378(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 253(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 632(sp)
-; ZVFHMIN64-NEXT:    lh a1, 376(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 11
-; ZVFHMIN64-NEXT:    vmv.x.s t0, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    lh a0, 376(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 252(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 630(sp)
-; ZVFHMIN64-NEXT:    lh a1, 374(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 10
-; ZVFHMIN64-NEXT:    vmv.x.s a7, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    lh a0, 374(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 251(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 628(sp)
-; ZVFHMIN64-NEXT:    lh a1, 372(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 9
-; ZVFHMIN64-NEXT:    vmv.x.s a6, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    lh a0, 372(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a1, 96(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    sb a0, 250(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 626(sp)
-; ZVFHMIN64-NEXT:    lh a1, 370(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    lh a0, 370(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a1, 104(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    sb a0, 249(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 624(sp)
-; ZVFHMIN64-NEXT:    lh a1, 368(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    lh a0, 368(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a1, 120(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    sb a0, 248(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 622(sp)
-; ZVFHMIN64-NEXT:    lh a1, 366(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    lh a0, 366(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a1, 88(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    sb a0, 247(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 620(sp)
-; ZVFHMIN64-NEXT:    lh a1, 364(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    lh a0, 364(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    ld a1, 112(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
 ; ZVFHMIN64-NEXT:    sb a0, 246(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 618(sp)
-; ZVFHMIN64-NEXT:    lh a1, 362(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t0
-; ZVFHMIN64-NEXT:    feq.h t0, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    lh a0, 362(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s2
 ; ZVFHMIN64-NEXT:    sb a0, 245(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 616(sp)
-; ZVFHMIN64-NEXT:    lh a1, 360(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT:    feq.h a7, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    lh a0, 360(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
 ; ZVFHMIN64-NEXT:    sb a0, 244(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 614(sp)
-; ZVFHMIN64-NEXT:    lh a1, 358(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a6
-; ZVFHMIN64-NEXT:    feq.h a6, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    lh a0, 358(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    ld a2, 120(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 8
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    sb a0, 243(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 612(sp)
-; ZVFHMIN64-NEXT:    lh a1, 356(sp)
-; ZVFHMIN64-NEXT:    sb a5, 204(sp)
-; ZVFHMIN64-NEXT:    sb a4, 205(sp)
-; ZVFHMIN64-NEXT:    sb a2, 206(sp)
-; ZVFHMIN64-NEXT:    sb a3, 207(sp)
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a2, 200(sp)
-; ZVFHMIN64-NEXT:    sb a6, 201(sp)
-; ZVFHMIN64-NEXT:    sb a7, 202(sp)
-; ZVFHMIN64-NEXT:    sb t0, 203(sp)
-; ZVFHMIN64-NEXT:    li a2, 128
+; ZVFHMIN64-NEXT:    lh a2, 356(sp)
+; ZVFHMIN64-NEXT:    sb s6, 204(sp)
+; ZVFHMIN64-NEXT:    sb s8, 205(sp)
+; ZVFHMIN64-NEXT:    sb s7, 206(sp)
+; ZVFHMIN64-NEXT:    sb s3, 207(sp)
+; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a3, 200(sp)
+; ZVFHMIN64-NEXT:    sb a1, 201(sp)
+; ZVFHMIN64-NEXT:    sb a4, 202(sp)
+; ZVFHMIN64-NEXT:    sb s4, 203(sp)
+; ZVFHMIN64-NEXT:    li a1, 128
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 242(sp)
 ; ZVFHMIN64-NEXT:    addi a0, sp, 128
-; ZVFHMIN64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; ZVFHMIN64-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; ZVFHMIN64-NEXT:    vle8.v v8, (a0)
 ; ZVFHMIN64-NEXT:    vand.vi v8, v8, 1
 ; ZVFHMIN64-NEXT:    vmsne.vi v0, v8, 0
-; ZVFHMIN64-NEXT:    addi sp, s0, -896
-; ZVFHMIN64-NEXT:    .cfi_def_cfa sp, 896
-; ZVFHMIN64-NEXT:    ld ra, 888(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s0, 880(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s2, 872(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s3, 864(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s4, 856(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s5, 848(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s6, 840(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s7, 832(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s8, 824(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s9, 816(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s10, 808(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s11, 800(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    addi sp, s0, -1024
+; ZVFHMIN64-NEXT:    .cfi_def_cfa sp, 1024
+; ZVFHMIN64-NEXT:    ld ra, 1016(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s0, 1008(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s2, 1000(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s3, 992(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s4, 984(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s5, 976(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s6, 968(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s7, 960(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s8, 952(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s9, 944(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s10, 936(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s11, 928(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fld fs0, 920(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fld fs1, 912(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fld fs2, 904(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fld fs3, 896(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fld fs4, 888(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fld fs5, 880(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fld fs6, 872(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fld fs7, 864(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fld fs8, 856(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fld fs9, 848(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fld fs10, 840(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fld fs11, 832(sp) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    .cfi_restore ra
 ; ZVFHMIN64-NEXT:    .cfi_restore s0
 ; ZVFHMIN64-NEXT:    .cfi_restore s2
@@ -3376,7 +3570,19 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    .cfi_restore s9
 ; ZVFHMIN64-NEXT:    .cfi_restore s10
 ; ZVFHMIN64-NEXT:    .cfi_restore s11
-; ZVFHMIN64-NEXT:    addi sp, sp, 896
+; ZVFHMIN64-NEXT:    .cfi_restore fs0
+; ZVFHMIN64-NEXT:    .cfi_restore fs1
+; ZVFHMIN64-NEXT:    .cfi_restore fs2
+; ZVFHMIN64-NEXT:    .cfi_restore fs3
+; ZVFHMIN64-NEXT:    .cfi_restore fs4
+; ZVFHMIN64-NEXT:    .cfi_restore fs5
+; ZVFHMIN64-NEXT:    .cfi_restore fs6
+; ZVFHMIN64-NEXT:    .cfi_restore fs7
+; ZVFHMIN64-NEXT:    .cfi_restore fs8
+; ZVFHMIN64-NEXT:    .cfi_restore fs9
+; ZVFHMIN64-NEXT:    .cfi_restore fs10
+; ZVFHMIN64-NEXT:    .cfi_restore fs11
+; ZVFHMIN64-NEXT:    addi sp, sp, 1024
 ; ZVFHMIN64-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN64-NEXT:    ret
   %v = call <128 x i1> @llvm.vp.fcmp.v128f16(<128 x half> %va, <128 x half> %vb, metadata !"oeq", <128 x i1> %m, i32 %evl)
@@ -3953,20 +4159,20 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a3, 16
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    mv a0, a2
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
-; CHECK-NEXT:    bltu a2, a1, .LBB87_2
+; CHECK-NEXT:    bltu a2, a3, .LBB87_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB87_2:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v7, v8, v16, v0.t
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
@@ -3977,13 +4183,13 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    vmfeq.vv v16, v8, v24, v0.t
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vslideup.vi v7, v8, 2
+; CHECK-NEXT:    vslideup.vi v7, v16, 2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
index 69d6ffa9f300c..81b8b2d5a2c88 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
@@ -592,55 +592,30 @@ declare <256 x i1> @llvm.vp.icmp.v256i8(<256 x i8>, <256 x i8>, metadata, <256 x
 define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: icmp_eq_vv_v256i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a1, 128
 ; CHECK-NEXT:    addi a4, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    addi a2, a3, -128
-; CHECK-NEXT:    vle8.v v8, (a4)
+; CHECK-NEXT:    vle8.v v24, (a4)
 ; CHECK-NEXT:    sltu a4, a3, a2
-; CHECK-NEXT:    vle8.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a2, a4, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v6, v16, v8, v0.t
+; CHECK-NEXT:    vmseq.vv v6, v16, v24, v0.t
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vle8.v v24, (a0)
 ; CHECK-NEXT:    bltu a3, a1, .LBB51_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:  .LBB51_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vv v16, v8, v24, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmv1r.v v8, v6
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <256 x i1> @llvm.vp.icmp.v256i8(<256 x i8> %va, <256 x i8> %vb, metadata !"eq", <256 x i1> %m, i32 %evl)
   ret <256 x i1> %v
@@ -652,12 +627,12 @@ define <256 x i1> @icmp_eq_vx_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 z
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a3, 128
+; CHECK-NEXT:    addi a4, a2, -128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltu a1, a2, a4
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vx v25, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB52_2
@@ -682,12 +657,12 @@ define <256 x i1> @icmp_eq_vx_swap_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m,
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a3, 128
+; CHECK-NEXT:    addi a4, a2, -128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltu a1, a2, a4
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vx v25, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB53_2
@@ -1263,19 +1238,19 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    li a3, 32
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 4
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v16, (a1)
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:    vle32.v v16, (a0)
-; CHECK-NEXT:    mv a0, a2
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 4
 ; CHECK-NEXT:    bltu a2, a3, .LBB99_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:  .LBB99_2:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vv v7, v8, v16, v0.t
 ; CHECK-NEXT:    addi a0, a2, -32
 ; CHECK-NEXT:    sltu a1, a2, a0
@@ -1308,9 +1283,9 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m
 define <64 x i1> @icmp_eq_vx_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: icmp_eq_vx_v64i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 4
+; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:    bltu a1, a3, .LBB100_2
 ; CHECK-NEXT:  # %bb.1:
@@ -1338,9 +1313,9 @@ define <64 x i1> @icmp_eq_vx_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i32 ze
 define <64 x i1> @icmp_eq_vx_swap_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: icmp_eq_vx_swap_v64i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 4
+; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:    bltu a1, a3, .LBB101_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
index d1980ee3b0a6f..26477edb33adc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
@@ -151,9 +151,9 @@ declare <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32>, <32 x i1>, i32)
 define <32 x i64> @vsext_v32i64_v32i32(<32 x i32> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vsext_v32i64_v32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v16, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB12_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
index abbbfe8f252fb..6099b51841a20 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
@@ -31,8 +31,8 @@ define <8 x i1> @v8i1_v16i1(<16 x i1>) {
 ; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    vslide1down.vx v9, v9, a2
-; RV32-NEXT:    vmv.v.i v0, 15
 ; RV32-NEXT:    vslide1down.vx v9, v9, a0
+; RV32-NEXT:    vmv.v.i v0, 15
 ; RV32-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; RV32-NEXT:    vand.vi v8, v8, 1
 ; RV32-NEXT:    vmsne.vi v0, v8, 0
@@ -65,8 +65,8 @@ define <8 x i1> @v8i1_v16i1(<16 x i1>) {
 ; RV64-NEXT:    srli a0, a0, 63
 ; RV64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-NEXT:    vslide1down.vx v9, v9, a2
-; RV64-NEXT:    vmv.v.i v0, 15
 ; RV64-NEXT:    vslide1down.vx v9, v9, a0
+; RV64-NEXT:    vmv.v.i v0, 15
 ; RV64-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; RV64-NEXT:    vand.vi v8, v8, 1
 ; RV64-NEXT:    vmsne.vi v0, v8, 0
@@ -80,13 +80,13 @@ define <4 x i32> @v4i32_v8i32(<8 x i32>) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vmv.v.i v0, 5
 ; CHECK-NEXT:    vsrl.vi v10, v10, 1
 ; CHECK-NEXT:    vrsub.vi v11, v10, 3
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 4
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT:    vmv.v.i v0, 5
 ; CHECK-NEXT:    vslidedown.vi v10, v8, 1, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -156,15 +156,15 @@ define <4 x i32> @v4i32_v32i32(<32 x i32>) {
 ; RV32-NEXT:    vse32.v v8, (a1)
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v8, 4
-; RV32-NEXT:    lw a0, 36(sp)
-; RV32-NEXT:    vmv.x.s a1, v16
+; RV32-NEXT:    vmv.x.s a0, v16
+; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    lw a1, 120(sp)
-; RV32-NEXT:    vslide1down.vx v9, v9, a0
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vslide1down.vx v8, v9, a0
+; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    lw a0, 36(sp)
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a1
+; RV32-NEXT:    lw a0, 120(sp)
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    addi sp, s0, -256
 ; RV32-NEXT:    .cfi_def_cfa sp, 256
 ; RV32-NEXT:    lw ra, 252(sp) # 4-byte Folded Reload
@@ -194,15 +194,15 @@ define <4 x i32> @v4i32_v32i32(<32 x i32>) {
 ; RV64-NEXT:    vse32.v v8, (a1)
 ; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 4
-; RV64-NEXT:    lw a0, 36(sp)
-; RV64-NEXT:    vmv.x.s a1, v16
+; RV64-NEXT:    vmv.x.s a0, v16
+; RV64-NEXT:    vmv.x.s a1, v8
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v9, a1
-; RV64-NEXT:    lw a1, 120(sp)
-; RV64-NEXT:    vslide1down.vx v9, v9, a0
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslide1down.vx v8, v9, a0
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    lw a0, 36(sp)
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    vslide1down.vx v8, v8, a1
+; RV64-NEXT:    lw a0, 120(sp)
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    addi sp, s0, -256
 ; RV64-NEXT:    .cfi_def_cfa sp, 256
 ; RV64-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
@@ -219,13 +219,13 @@ define <4 x i32> @v4i32_v32i32(<32 x i32>) {
 define <16 x i1> @v16i1_v8i1(<8 x i1>) {
 ; CHECK-LABEL: v16i1_v8i1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI4_0)
-; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
-; CHECK-NEXT:    vrgather.vv v10, v9, v8
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vrgather.vv v10, v8, v9
 ; CHECK-NEXT:    vmsne.vi v0, v10, 0
 ; CHECK-NEXT:    ret
   %2 = shufflevector <8 x i1> %0, <8 x i1> poison, <16 x i32> <i32 2, i32 3, i32 0, i32 5, i32 1, i32 2, i32 0, i32 6, i32 2, i32 3, i32 0, i32 7, i32 1, i32 2, i32 0, i32 4>
@@ -257,11 +257,11 @@ define <16 x i32> @v16i32_v4i32(<4 x i32>) {
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    lui a1, 4
 ; CHECK-NEXT:    addi a1, a1, 548
+; CHECK-NEXT:    addi a0, a0, -1856
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmerge.vim v9, v9, 2, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    addi a0, a0, -1856
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmerge.vim v9, v9, 0, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index f2353e7d028bd..5c2d61138df13 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -185,9 +185,9 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x
 ; VLA-NEXT:    vmv2r.v v20, v14
 ; VLA-NEXT:    vmv2r.v v16, v12
 ; VLA-NEXT:    vmv2r.v v12, v10
-; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vslideup.vi v16, v20, 8
 ; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; VLA-NEXT:    vslideup.vi v8, v16, 16
 ; VLA-NEXT:    ret
@@ -212,7 +212,6 @@ define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 ; VLA-NEXT:    vmv1r.v v22, v11
 ; VLA-NEXT:    vmv1r.v v12, v10
 ; VLA-NEXT:    vmv1r.v v10, v9
-; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vslideup.vi v20, v18, 4
 ; VLA-NEXT:    vslideup.vi v16, v14, 4
 ; VLA-NEXT:    vslideup.vi v12, v22, 4
@@ -220,6 +219,7 @@ define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 ; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; VLA-NEXT:    vslideup.vi v16, v20, 8
 ; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; VLA-NEXT:    vslideup.vi v8, v16, 16
 ; VLA-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
index 10dadbc022e02..140d1450e1e5c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
@@ -11,16 +11,15 @@ define void @deinterleave3_0_i8(ptr %in, ptr %out) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI0_0)
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
 ; CHECK-NEXT:    li a0, 73
 ; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI0_0)
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 8
+; CHECK-NEXT:    vslidedown.vi v9, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT:    vle8.v v9, (a0)
 ; CHECK-NEXT:    vrgather.vv v10, v8, v9
 ; CHECK-NEXT:    vse8.v v10, (a1)
 ; CHECK-NEXT:    ret
@@ -36,16 +35,15 @@ define void @deinterleave3_8_i8(ptr %in, ptr %out) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI1_0)
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
 ; CHECK-NEXT:    li a0, 146
 ; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI1_0)
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 8
+; CHECK-NEXT:    vslidedown.vi v9, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT:    vle8.v v9, (a0)
 ; CHECK-NEXT:    vrgather.vv v10, v8, v9
 ; CHECK-NEXT:    vse8.v v10, (a1)
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
index c0c17d4e0623e..0b7a50912b447 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -186,9 +186,9 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) {
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vmv.v.i v0, 5
 ; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v11, (a0)
-; CHECK-NEXT:    vmv.v.i v0, 5
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vsrl.vi v10, v10, 1
 ; CHECK-NEXT:    vadd.vi v10, v10, 1
@@ -210,14 +210,14 @@ define <16 x float> @shuffle2(<4 x float> %a) vscale_range(2,2) {
 ; CHECK-LABEL: shuffle2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v12, v8
-; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmv.v.i v12, 0
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT:    vid.v v13
-; CHECK-NEXT:    vadd.vv v13, v13, v13
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vadd.vv v9, v9, v9
+; CHECK-NEXT:    vrsub.vi v9, v9, 4
 ; CHECK-NEXT:    vmv.v.i v0, 6
-; CHECK-NEXT:    vrsub.vi v13, v13, 4
-; CHECK-NEXT:    vrgather.vv v9, v12, v13, v0.t
+; CHECK-NEXT:    vrgather.vv v13, v8, v9, v0.t
+; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
   %b = extractelement <4 x float> %a, i32 2
   %c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %b, i32 5
@@ -255,11 +255,10 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vsca
 ; RV64-NEXT:    addi s0, sp, 256
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -128
-; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; RV64-NEXT:    vmv.v.i v0, 1
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vmv.v.i v16, 0
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; RV64-NEXT:    vmv.v.i v0, 1
 ; RV64-NEXT:    vrgather.vi v18, v15, 1, v0.t
 ; RV64-NEXT:    mv s2, sp
 ; RV64-NEXT:    vs8r.v v16, (s2)
@@ -291,9 +290,9 @@ define <4 x double> @shuffles_add(<4 x double> %0, <4 x double> %1) vscale_range
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vmv1r.v v13, v10
 ; CHECK-NEXT:    vslideup.vi v13, v11, 1
+; CHECK-NEXT:    vrgather.vi v12, v9, 0
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    vmv.v.i v0, 1
-; CHECK-NEXT:    vrgather.vi v12, v9, 0
 ; CHECK-NEXT:    vmv1r.v v9, v11
 ; CHECK-NEXT:    vrgather.vi v9, v10, 1, v0.t
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -326,8 +325,8 @@ define <16 x i32> @m4_linear_num_of_shuffles_in_chunks(<16 x i32> %0) vscale_ran
 ; CHECK-LABEL: m4_linear_num_of_shuffles_in_chunks:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v0, 8
 ; CHECK-NEXT:    vrgather.vi v12, v10, 0
+; CHECK-NEXT:    vmv.v.i v0, 8
 ; CHECK-NEXT:    vrgather.vi v12, v11, 0, v0.t
 ; CHECK-NEXT:    vrgather.vi v14, v8, 2
 ; CHECK-NEXT:    vrgather.vi v15, v10, 3
@@ -348,16 +347,18 @@ define i64 @multi_chunks_shuffle(<32 x i32> %0) vscale_range(8,8) {
 ; RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
 ; RV32-NEXT:    vand.vx v12, v12, a1
 ; RV32-NEXT:    vand.vx v10, v10, a1
 ; RV32-NEXT:    vsrl.vv v12, v8, v12
 ; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.i v10, 0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetivli zero, 16, e64, m2, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; RV32-NEXT:    vrgather.vi v10, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v10
@@ -373,12 +374,12 @@ define i64 @multi_chunks_shuffle(<32 x i32> %0) vscale_range(8,8) {
 ; RV64-NEXT:    vsetivli zero, 16, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vx v10, v8, a0
 ; RV64-NEXT:    vsll.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addi a0, a0, -241
 ; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV64-NEXT:    vmv.v.i v10, 0
+; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    addi a0, a0, -241
+; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    vmerge.vvm v8, v10, v8, v0
 ; RV64-NEXT:    vrgather.vi v10, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v10
@@ -437,11 +438,9 @@ define void @shuffle_3_input_vectors() vscale_range(4,4) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 1
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 6
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
+; CHECK-NEXT:    vmv.v.i v0, 6
 ; CHECK-NEXT:    vslidedown.vi v20, v8, 1, v0.t
 ; CHECK-NEXT:    vslideup.vi v20, v9, 3
 ; CHECK-NEXT:    vslidedown.vi v21, v9, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
index c222626a166fe..eb0ee5773962b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
@@ -1161,8 +1161,8 @@ define <32 x half> @reverse_v32f16_2(<16 x half> %a) {
 ; CHECK-NEXT:    vrgather.vv v10, v9, v12
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    vrgather.vv v8, v9, v12
 ; CHECK-NEXT:    addi a0, a0, -32
+; CHECK-NEXT:    vrgather.vv v8, v9, v12
 ; CHECK-NEXT:    vmv.v.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
index 3c28e978842b9..72a62627755dd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
@@ -179,9 +179,9 @@ define void @vnsrl_32_i32(ptr %in, ptr %out) {
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; ZVE32F-NEXT:    vle32.v v8, (a0)
-; ZVE32F-NEXT:    vmv.v.i v0, 1
 ; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
 ; ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
+; ZVE32F-NEXT:    vmv.v.i v0, 1
 ; ZVE32F-NEXT:    vrgather.vi v9, v8, 1, v0.t
 ; ZVE32F-NEXT:    vse32.v v9, (a1)
 ; ZVE32F-NEXT:    ret
@@ -233,9 +233,9 @@ define void @vnsrl_32_float(ptr %in, ptr %out) {
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; ZVE32F-NEXT:    vle32.v v8, (a0)
-; ZVE32F-NEXT:    vmv.v.i v0, 1
 ; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
 ; ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
+; ZVE32F-NEXT:    vmv.v.i v0, 1
 ; ZVE32F-NEXT:    vrgather.vi v9, v8, 1, v0.t
 ; ZVE32F-NEXT:    vse32.v v9, (a1)
 ; ZVE32F-NEXT:    ret
@@ -276,9 +276,9 @@ define void @vnsrl_64_i64(ptr %in, ptr %out) {
 ; V:       # %bb.0: # %entry
 ; V-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; V-NEXT:    vle64.v v8, (a0)
-; V-NEXT:    vmv.v.i v0, 1
 ; V-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; V-NEXT:    vslidedown.vi v9, v8, 2
+; V-NEXT:    vmv.v.i v0, 1
 ; V-NEXT:    vrgather.vi v9, v8, 1, v0.t
 ; V-NEXT:    vse64.v v9, (a1)
 ; V-NEXT:    ret
@@ -327,9 +327,9 @@ define void @vnsrl_64_double(ptr %in, ptr %out) {
 ; V:       # %bb.0: # %entry
 ; V-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; V-NEXT:    vle64.v v8, (a0)
-; V-NEXT:    vmv.v.i v0, 1
 ; V-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; V-NEXT:    vslidedown.vi v9, v8, 2
+; V-NEXT:    vmv.v.i v0, 1
 ; V-NEXT:    vrgather.vi v9, v8, 1, v0.t
 ; V-NEXT:    vse64.v v9, (a1)
 ; V-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
index a2d41de5d1853..ba3b994de46f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
@@ -390,9 +390,9 @@ declare <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64>, <32 x i1>, i32)
 define <32 x double> @vsitofp_v32f64_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vsitofp_v32f64_v32i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB25_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store-merge-crash.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store-merge-crash.ll
index 391117c72ece7..3a3d417868dfe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store-merge-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store-merge-crash.ll
@@ -14,8 +14,8 @@ define void @baz() nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lui a0, %hi(foo)
 ; CHECK-NEXT:    addi a1, a0, %lo(foo)
-; CHECK-NEXT:    lw a1, 4(a1)
 ; CHECK-NEXT:    lw a0, %lo(foo)(a0)
+; CHECK-NEXT:    lw a1, 4(a1)
 ; CHECK-NEXT:    lui a2, %hi(bar)
 ; CHECK-NEXT:    sw a1, %lo(bar)(a2)
 ; CHECK-NEXT:    addi a1, a2, %lo(bar)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index 29d9a8a9b060c..0510cac7ffd0e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -638,10 +638,10 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur
 ; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; V-NEXT:  .LBB12_1: # %bb2
 ; V-NEXT:    # =>This Inner Loop Header: Depth=1
-; V-NEXT:    vlse64.v v8, (a1), a3
-; V-NEXT:    addi a4, a1, 80
-; V-NEXT:    vlse64.v v9, (a4), a3
 ; V-NEXT:    addi a4, a0, 16
+; V-NEXT:    addi a5, a1, 80
+; V-NEXT:    vlse64.v v8, (a1), a3
+; V-NEXT:    vlse64.v v9, (a5), a3
 ; V-NEXT:    vse64.v v8, (a0)
 ; V-NEXT:    addi a0, a0, 32
 ; V-NEXT:    vse64.v v9, (a4)
@@ -662,6 +662,7 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur
 ; ZVE32F-NEXT:    mul a6, a3, a5
 ; ZVE32F-NEXT:    mul a7, a2, a5
 ; ZVE32F-NEXT:    addi a2, a2, 4
+; ZVE32F-NEXT:    addi a3, a3, 4
 ; ZVE32F-NEXT:    add a6, a1, a6
 ; ZVE32F-NEXT:    add a7, a1, a7
 ; ZVE32F-NEXT:    ld t0, 0(a7)
@@ -673,7 +674,6 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur
 ; ZVE32F-NEXT:    sd a7, 16(a0)
 ; ZVE32F-NEXT:    sd a6, 24(a0)
 ; ZVE32F-NEXT:    addi a0, a0, 32
-; ZVE32F-NEXT:    addi a3, a3, 4
 ; ZVE32F-NEXT:    bne a0, a4, .LBB12_1
 ; ZVE32F-NEXT:  # %bb.2: # %bb18
 ; ZVE32F-NEXT:    ret
@@ -686,10 +686,10 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur
 ; OPTZVE32F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; OPTZVE32F-NEXT:  .LBB12_1: # %bb2
 ; OPTZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
-; OPTZVE32F-NEXT:    vlse64.v v8, (a1), a3
-; OPTZVE32F-NEXT:    addi a4, a1, 80
-; OPTZVE32F-NEXT:    vlse64.v v9, (a4), a3
 ; OPTZVE32F-NEXT:    addi a4, a0, 16
+; OPTZVE32F-NEXT:    addi a5, a1, 80
+; OPTZVE32F-NEXT:    vlse64.v v8, (a1), a3
+; OPTZVE32F-NEXT:    vlse64.v v9, (a5), a3
 ; OPTZVE32F-NEXT:    vse64.v v8, (a0)
 ; OPTZVE32F-NEXT:    addi a0, a0, 32
 ; OPTZVE32F-NEXT:    vse64.v v9, (a4)
@@ -710,6 +710,7 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur
 ; OPTV-NEXT:    mul a6, a3, a5
 ; OPTV-NEXT:    mul a7, a2, a5
 ; OPTV-NEXT:    addi a2, a2, 4
+; OPTV-NEXT:    addi a3, a3, 4
 ; OPTV-NEXT:    add a6, a1, a6
 ; OPTV-NEXT:    add a7, a1, a7
 ; OPTV-NEXT:    ld t0, 0(a7)
@@ -721,7 +722,6 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur
 ; OPTV-NEXT:    sd a7, 16(a0)
 ; OPTV-NEXT:    sd a6, 24(a0)
 ; OPTV-NEXT:    addi a0, a0, 32
-; OPTV-NEXT:    addi a3, a3, 4
 ; OPTV-NEXT:    bne a0, a4, .LBB12_1
 ; OPTV-NEXT:  # %bb.2: # %bb18
 ; OPTV-NEXT:    ret
@@ -791,14 +791,14 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
 ; ZVE32F-NEXT:    mul t2, a3, a5
 ; ZVE32F-NEXT:    mul t3, a2, a5
 ; ZVE32F-NEXT:    addi a2, a2, 4
-; ZVE32F-NEXT:    addi a1, a1, 32
+; ZVE32F-NEXT:    addi a3, a3, 4
 ; ZVE32F-NEXT:    add t2, a0, t2
 ; ZVE32F-NEXT:    add t3, a0, t3
 ; ZVE32F-NEXT:    sd a6, 0(t3)
 ; ZVE32F-NEXT:    sd a7, 0(t2)
 ; ZVE32F-NEXT:    sd t0, 80(t3)
 ; ZVE32F-NEXT:    sd t1, 80(t2)
-; ZVE32F-NEXT:    addi a3, a3, 4
+; ZVE32F-NEXT:    addi a1, a1, 32
 ; ZVE32F-NEXT:    bne a1, a4, .LBB13_1
 ; ZVE32F-NEXT:  # %bb.2: # %bb18
 ; ZVE32F-NEXT:    ret
@@ -839,14 +839,14 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
 ; OPTV-NEXT:    mul t2, a3, a5
 ; OPTV-NEXT:    mul t3, a2, a5
 ; OPTV-NEXT:    addi a2, a2, 4
-; OPTV-NEXT:    addi a1, a1, 32
+; OPTV-NEXT:    addi a3, a3, 4
 ; OPTV-NEXT:    add t2, a0, t2
 ; OPTV-NEXT:    add t3, a0, t3
 ; OPTV-NEXT:    sd a6, 0(t3)
 ; OPTV-NEXT:    sd a7, 0(t2)
 ; OPTV-NEXT:    sd t0, 80(t3)
 ; OPTV-NEXT:    sd t1, 80(t2)
-; OPTV-NEXT:    addi a3, a3, 4
+; OPTV-NEXT:    addi a1, a1, 32
 ; OPTV-NEXT:    bne a1, a4, .LBB13_1
 ; OPTV-NEXT:  # %bb.2: # %bb18
 ; OPTV-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index 4b7f82f94f5e4..fe86344ec73fb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -609,11 +609,11 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV32-NEXT:  .LBB47_2:
 ; CHECK-RV32-NEXT:    mul a6, a3, a2
 ; CHECK-RV32-NEXT:    addi a5, a4, -32
+; CHECK-RV32-NEXT:    add a6, a1, a6
 ; CHECK-RV32-NEXT:    sltu a7, a4, a5
 ; CHECK-RV32-NEXT:    addi a7, a7, -1
 ; CHECK-RV32-NEXT:    and a7, a7, a5
 ; CHECK-RV32-NEXT:    li a5, 16
-; CHECK-RV32-NEXT:    add a6, a1, a6
 ; CHECK-RV32-NEXT:    bltu a7, a5, .LBB47_4
 ; CHECK-RV32-NEXT:  # %bb.3:
 ; CHECK-RV32-NEXT:    li a7, 16
@@ -636,16 +636,16 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV32-NEXT:    add a5, a1, a5
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v24, (a5), a2, v0.t
+; CHECK-RV32-NEXT:    addi a3, a0, 128
 ; CHECK-RV32-NEXT:    vmv1r.v v0, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v8, (a1), a2, v0.t
-; CHECK-RV32-NEXT:    addi a1, a0, 128
-; CHECK-RV32-NEXT:    addi a2, a0, 256
+; CHECK-RV32-NEXT:    addi a1, a0, 256
 ; CHECK-RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vse64.v v8, (a0)
-; CHECK-RV32-NEXT:    vse64.v v24, (a1)
+; CHECK-RV32-NEXT:    vse64.v v24, (a3)
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vse64.v v16, (a2)
+; CHECK-RV32-NEXT:    vse64.v v16, (a1)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: strided_load_v33f64:
@@ -660,11 +660,11 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV64-NEXT:  .LBB47_2:
 ; CHECK-RV64-NEXT:    mul a6, a4, a2
 ; CHECK-RV64-NEXT:    addi a5, a3, -32
+; CHECK-RV64-NEXT:    add a6, a1, a6
 ; CHECK-RV64-NEXT:    sltu a7, a3, a5
 ; CHECK-RV64-NEXT:    addi a7, a7, -1
 ; CHECK-RV64-NEXT:    and a7, a7, a5
 ; CHECK-RV64-NEXT:    li a5, 16
-; CHECK-RV64-NEXT:    add a6, a1, a6
 ; CHECK-RV64-NEXT:    bltu a7, a5, .LBB47_4
 ; CHECK-RV64-NEXT:  # %bb.3:
 ; CHECK-RV64-NEXT:    li a7, 16
@@ -687,16 +687,16 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV64-NEXT:    add a5, a1, a5
 ; CHECK-RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vlse64.v v24, (a5), a2, v0.t
+; CHECK-RV64-NEXT:    addi a4, a0, 128
 ; CHECK-RV64-NEXT:    vmv1r.v v0, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vlse64.v v8, (a1), a2, v0.t
-; CHECK-RV64-NEXT:    addi a1, a0, 128
-; CHECK-RV64-NEXT:    addi a2, a0, 256
+; CHECK-RV64-NEXT:    addi a1, a0, 256
 ; CHECK-RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vse64.v v8, (a0)
-; CHECK-RV64-NEXT:    vse64.v v24, (a1)
+; CHECK-RV64-NEXT:    vse64.v v24, (a4)
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vse64.v v16, (a2)
+; CHECK-RV64-NEXT:    vse64.v v16, (a1)
 ; CHECK-RV64-NEXT:    ret
   %v = call <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr %ptr, i64 %stride, <33 x i1> %mask, i32 %evl)
   ret <33 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
index 7ca329835b7ac..733c850d64011 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
@@ -472,9 +472,9 @@ define void @strided_store_v32f64(<32 x double> %v, ptr %ptr, i32 signext %strid
 ; CHECK-NEXT:    addi a3, a2, -16
 ; CHECK-NEXT:    sltu a2, a2, a3
 ; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v16, (a0), a1, v0.t
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
index a91dee1cb245f..dd5630e165f19 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
@@ -55,8 +55,8 @@ define <128 x i7> @vtrunc_v128i7_v128i16(<128 x i16> %a, <128 x i1> %m, i32 zero
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv8r.v v24, v8
-; CHECK-NEXT:    li a1, 64
 ; CHECK-NEXT:    vslidedown.vi v12, v0, 8
+; CHECK-NEXT:    li a1, 64
 ; CHECK-NEXT:    mv a2, a0
 ; CHECK-NEXT:    bltu a0, a1, .LBB4_2
 ; CHECK-NEXT:  # %bb.1:
@@ -245,64 +245,64 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vslidedown.vi v6, v0, 8
-; CHECK-NEXT:    addi a2, a1, 512
-; CHECK-NEXT:    addi a3, a1, 640
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v26, v0, 4
+; CHECK-NEXT:    addi a3, a1, 128
+; CHECK-NEXT:    addi a2, a1, 640
 ; CHECK-NEXT:    addi a4, a7, -64
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a2)
+; CHECK-NEXT:    sltu a2, a7, a4
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a4, a2, a4
+; CHECK-NEXT:    addi a2, a4, -32
+; CHECK-NEXT:    sltu a5, a4, a2
+; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    and a5, a5, a2
+; CHECK-NEXT:    addi a2, a5, -16
+; CHECK-NEXT:    sltu a6, a5, a2
+; CHECK-NEXT:    addi a6, a6, -1
+; CHECK-NEXT:    and a2, a6, a2
+; CHECK-NEXT:    addi a6, a1, 512
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v27, v6, 4
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a3)
-; CHECK-NEXT:    sltu a3, a7, a4
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v27, 2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a4, a3, a4
-; CHECK-NEXT:    addi a3, a4, -32
-; CHECK-NEXT:    sltu a5, a4, a3
-; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    and a3, a5, a3
-; CHECK-NEXT:    addi a5, a3, -16
-; CHECK-NEXT:    sltu a6, a3, a5
-; CHECK-NEXT:    addi a6, a6, -1
-; CHECK-NEXT:    and a5, a6, a5
-; CHECK-NEXT:    vsetvli zero, a5, e32, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 4
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a2)
-; CHECK-NEXT:    addi a5, a1, 128
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 4
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a2, 16
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v26, v7, 4
-; CHECK-NEXT:    bltu a3, a2, .LBB16_2
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a6)
+; CHECK-NEXT:    bltu a5, a2, .LBB16_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 16
+; CHECK-NEXT:    li a5, 16
 ; CHECK-NEXT:  .LBB16_2:
 ; CHECK-NEXT:    vmv1r.v v0, v27
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a5)
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 56
-; CHECK-NEXT:    mul a5, a5, a6
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v27, v26, 2
-; CHECK-NEXT:    li a5, 64
-; CHECK-NEXT:    vsetvli zero, a3, e32, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v16, (a3)
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    li a6, 56
+; CHECK-NEXT:    mul a3, a3, a6
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli zero, a5, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a3, a3, 6
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
 ; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    li a3, 64
 ; CHECK-NEXT:    mv a6, a7
-; CHECK-NEXT:    bltu a7, a5, .LBB16_4
+; CHECK-NEXT:    bltu a7, a3, .LBB16_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    li a6, 64
 ; CHECK-NEXT:  .LBB16_4:
@@ -343,13 +343,13 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    li a6, 16
 ; CHECK-NEXT:  .LBB16_6:
 ; CHECK-NEXT:    vmv1r.v v0, v26
+; CHECK-NEXT:    addi a1, a1, 256
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v26, v6, 2
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a5)
 ; CHECK-NEXT:    addi a5, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a1, a1, 256
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v26, v6, 2
 ; CHECK-NEXT:    csrr a5, vlenb
 ; CHECK-NEXT:    li t0, 48
 ; CHECK-NEXT:    mul a5, a5, t0
@@ -369,13 +369,13 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:  # %bb.7:
 ; CHECK-NEXT:    li a5, 32
 ; CHECK-NEXT:  .LBB16_8:
+; CHECK-NEXT:    vmv1r.v v0, v26
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a5, -16
 ; CHECK-NEXT:    sltu a5, a5, a1
 ; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    and a1, a5, a1
-; CHECK-NEXT:    vmv1r.v v0, v26
 ; CHECK-NEXT:    addi a5, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
@@ -543,8 +543,8 @@ define <32 x i32> @vtrunc_v32i32_v32i64(<32 x i64> %a, <32 x i1> %m, i32 zeroext
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv8r.v v24, v8
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vslidedown.vi v12, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB17_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
index a0d5d2ccc848d..32aeb6300d17d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
@@ -390,9 +390,9 @@ declare <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64>, <32 x i1>, i32)
 define <32 x double> @vuitofp_v32f64_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vuitofp_v32f64_v32i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB25_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
index 6d9f69f436fc4..8e7f6666fb4ab 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
@@ -84,10 +84,10 @@ define <2 x i16> @mgather_v2i16_align1(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i16> %
 ; RV32-SLOW-NEXT:  # %bb.1: # %cond.load
 ; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV32-SLOW-NEXT:    vmv.x.s a1, v8
-; RV32-SLOW-NEXT:    lbu a2, 1(a1)
-; RV32-SLOW-NEXT:    lbu a1, 0(a1)
-; RV32-SLOW-NEXT:    slli a2, a2, 8
-; RV32-SLOW-NEXT:    or a1, a2, a1
+; RV32-SLOW-NEXT:    lbu a2, 0(a1)
+; RV32-SLOW-NEXT:    lbu a1, 1(a1)
+; RV32-SLOW-NEXT:    slli a1, a1, 8
+; RV32-SLOW-NEXT:    or a1, a1, a2
 ; RV32-SLOW-NEXT:    vsetvli zero, zero, e16, m2, tu, ma
 ; RV32-SLOW-NEXT:    vmv.s.x v9, a1
 ; RV32-SLOW-NEXT:  .LBB4_2: # %else
@@ -97,10 +97,10 @@ define <2 x i16> @mgather_v2i16_align1(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i16> %
 ; RV32-SLOW-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; RV32-SLOW-NEXT:    vslidedown.vi v8, v8, 1
 ; RV32-SLOW-NEXT:    vmv.x.s a0, v8
-; RV32-SLOW-NEXT:    lbu a1, 1(a0)
-; RV32-SLOW-NEXT:    lbu a0, 0(a0)
-; RV32-SLOW-NEXT:    slli a1, a1, 8
-; RV32-SLOW-NEXT:    or a0, a1, a0
+; RV32-SLOW-NEXT:    lbu a1, 0(a0)
+; RV32-SLOW-NEXT:    lbu a0, 1(a0)
+; RV32-SLOW-NEXT:    slli a0, a0, 8
+; RV32-SLOW-NEXT:    or a0, a0, a1
 ; RV32-SLOW-NEXT:    vmv.s.x v8, a0
 ; RV32-SLOW-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; RV32-SLOW-NEXT:    vslideup.vi v9, v8, 1
@@ -118,10 +118,10 @@ define <2 x i16> @mgather_v2i16_align1(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i16> %
 ; RV64-SLOW-NEXT:  # %bb.1: # %cond.load
 ; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV64-SLOW-NEXT:    vmv.x.s a1, v8
-; RV64-SLOW-NEXT:    lbu a2, 1(a1)
-; RV64-SLOW-NEXT:    lbu a1, 0(a1)
-; RV64-SLOW-NEXT:    slli a2, a2, 8
-; RV64-SLOW-NEXT:    or a1, a2, a1
+; RV64-SLOW-NEXT:    lbu a2, 0(a1)
+; RV64-SLOW-NEXT:    lbu a1, 1(a1)
+; RV64-SLOW-NEXT:    slli a1, a1, 8
+; RV64-SLOW-NEXT:    or a1, a1, a2
 ; RV64-SLOW-NEXT:    vsetvli zero, zero, e16, m2, tu, ma
 ; RV64-SLOW-NEXT:    vmv.s.x v9, a1
 ; RV64-SLOW-NEXT:  .LBB4_2: # %else
@@ -131,10 +131,10 @@ define <2 x i16> @mgather_v2i16_align1(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i16> %
 ; RV64-SLOW-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-SLOW-NEXT:    vslidedown.vi v8, v8, 1
 ; RV64-SLOW-NEXT:    vmv.x.s a0, v8
-; RV64-SLOW-NEXT:    lbu a1, 1(a0)
-; RV64-SLOW-NEXT:    lbu a0, 0(a0)
-; RV64-SLOW-NEXT:    slli a1, a1, 8
-; RV64-SLOW-NEXT:    or a0, a1, a0
+; RV64-SLOW-NEXT:    lbu a1, 0(a0)
+; RV64-SLOW-NEXT:    lbu a0, 1(a0)
+; RV64-SLOW-NEXT:    slli a0, a0, 8
+; RV64-SLOW-NEXT:    or a0, a0, a1
 ; RV64-SLOW-NEXT:    vmv.s.x v8, a0
 ; RV64-SLOW-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; RV64-SLOW-NEXT:    vslideup.vi v9, v8, 1
@@ -204,10 +204,10 @@ define <2 x i64> @mgather_v2i64_align4(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> %
 ; RV64-SLOW-NEXT:  # %bb.1: # %cond.load
 ; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m8, tu, ma
 ; RV64-SLOW-NEXT:    vmv.x.s a1, v8
-; RV64-SLOW-NEXT:    lwu a2, 4(a1)
-; RV64-SLOW-NEXT:    lwu a1, 0(a1)
-; RV64-SLOW-NEXT:    slli a2, a2, 32
-; RV64-SLOW-NEXT:    or a1, a2, a1
+; RV64-SLOW-NEXT:    lwu a2, 0(a1)
+; RV64-SLOW-NEXT:    lwu a1, 4(a1)
+; RV64-SLOW-NEXT:    slli a1, a1, 32
+; RV64-SLOW-NEXT:    or a1, a1, a2
 ; RV64-SLOW-NEXT:    vmv.s.x v9, a1
 ; RV64-SLOW-NEXT:  .LBB5_2: # %else
 ; RV64-SLOW-NEXT:    andi a0, a0, 2
@@ -216,10 +216,10 @@ define <2 x i64> @mgather_v2i64_align4(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> %
 ; RV64-SLOW-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-SLOW-NEXT:    vslidedown.vi v8, v8, 1
 ; RV64-SLOW-NEXT:    vmv.x.s a0, v8
-; RV64-SLOW-NEXT:    lwu a1, 4(a0)
-; RV64-SLOW-NEXT:    lwu a0, 0(a0)
-; RV64-SLOW-NEXT:    slli a1, a1, 32
-; RV64-SLOW-NEXT:    or a0, a1, a0
+; RV64-SLOW-NEXT:    lwu a1, 0(a0)
+; RV64-SLOW-NEXT:    lwu a0, 4(a0)
+; RV64-SLOW-NEXT:    slli a0, a0, 32
+; RV64-SLOW-NEXT:    or a0, a0, a1
 ; RV64-SLOW-NEXT:    vmv.s.x v8, a0
 ; RV64-SLOW-NEXT:    vslideup.vi v9, v8, 1
 ; RV64-SLOW-NEXT:  .LBB5_4: # %else2
@@ -489,12 +489,12 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
 ; RV32-SLOW-NEXT:    # implicit-def: $v8
 ; RV32-SLOW-NEXT:    beqz a3, .LBB8_2
 ; RV32-SLOW-NEXT:  # %bb.1: # %cond.load
-; RV32-SLOW-NEXT:    lbu a3, 1(a0)
-; RV32-SLOW-NEXT:    lbu a4, 0(a0)
+; RV32-SLOW-NEXT:    lbu a3, 0(a0)
+; RV32-SLOW-NEXT:    lbu a4, 1(a0)
 ; RV32-SLOW-NEXT:    lbu a5, 2(a0)
 ; RV32-SLOW-NEXT:    lbu a6, 3(a0)
-; RV32-SLOW-NEXT:    slli a3, a3, 8
-; RV32-SLOW-NEXT:    or a3, a3, a4
+; RV32-SLOW-NEXT:    slli a4, a4, 8
+; RV32-SLOW-NEXT:    or a3, a4, a3
 ; RV32-SLOW-NEXT:    slli a5, a5, 16
 ; RV32-SLOW-NEXT:    slli a6, a6, 24
 ; RV32-SLOW-NEXT:    or a4, a6, a5
@@ -505,12 +505,12 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
 ; RV32-SLOW-NEXT:    andi a2, a2, 2
 ; RV32-SLOW-NEXT:    beqz a2, .LBB8_4
 ; RV32-SLOW-NEXT:  # %bb.3: # %cond.load1
-; RV32-SLOW-NEXT:    lbu a2, 5(a0)
-; RV32-SLOW-NEXT:    lbu a3, 4(a0)
+; RV32-SLOW-NEXT:    lbu a2, 4(a0)
+; RV32-SLOW-NEXT:    lbu a3, 5(a0)
 ; RV32-SLOW-NEXT:    lbu a4, 6(a0)
 ; RV32-SLOW-NEXT:    lbu a0, 7(a0)
-; RV32-SLOW-NEXT:    slli a2, a2, 8
-; RV32-SLOW-NEXT:    or a2, a2, a3
+; RV32-SLOW-NEXT:    slli a3, a3, 8
+; RV32-SLOW-NEXT:    or a2, a3, a2
 ; RV32-SLOW-NEXT:    slli a4, a4, 16
 ; RV32-SLOW-NEXT:    slli a0, a0, 24
 ; RV32-SLOW-NEXT:    or a0, a0, a4
@@ -533,12 +533,12 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
 ; RV64-SLOW-NEXT:    # implicit-def: $v8
 ; RV64-SLOW-NEXT:    beqz a3, .LBB8_2
 ; RV64-SLOW-NEXT:  # %bb.1: # %cond.load
-; RV64-SLOW-NEXT:    lbu a3, 1(a0)
-; RV64-SLOW-NEXT:    lbu a4, 0(a0)
+; RV64-SLOW-NEXT:    lbu a3, 0(a0)
+; RV64-SLOW-NEXT:    lbu a4, 1(a0)
 ; RV64-SLOW-NEXT:    lbu a5, 2(a0)
 ; RV64-SLOW-NEXT:    lb a6, 3(a0)
-; RV64-SLOW-NEXT:    slli a3, a3, 8
-; RV64-SLOW-NEXT:    or a3, a3, a4
+; RV64-SLOW-NEXT:    slli a4, a4, 8
+; RV64-SLOW-NEXT:    or a3, a4, a3
 ; RV64-SLOW-NEXT:    slli a5, a5, 16
 ; RV64-SLOW-NEXT:    slli a6, a6, 24
 ; RV64-SLOW-NEXT:    or a4, a6, a5
@@ -549,12 +549,12 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
 ; RV64-SLOW-NEXT:    andi a2, a2, 2
 ; RV64-SLOW-NEXT:    beqz a2, .LBB8_4
 ; RV64-SLOW-NEXT:  # %bb.3: # %cond.load1
-; RV64-SLOW-NEXT:    lbu a2, 5(a0)
-; RV64-SLOW-NEXT:    lbu a3, 4(a0)
+; RV64-SLOW-NEXT:    lbu a2, 4(a0)
+; RV64-SLOW-NEXT:    lbu a3, 5(a0)
 ; RV64-SLOW-NEXT:    lbu a4, 6(a0)
 ; RV64-SLOW-NEXT:    lb a0, 7(a0)
-; RV64-SLOW-NEXT:    slli a2, a2, 8
-; RV64-SLOW-NEXT:    or a2, a2, a3
+; RV64-SLOW-NEXT:    slli a3, a3, 8
+; RV64-SLOW-NEXT:    or a2, a3, a2
 ; RV64-SLOW-NEXT:    slli a4, a4, 16
 ; RV64-SLOW-NEXT:    slli a0, a0, 24
 ; RV64-SLOW-NEXT:    or a0, a0, a4
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
index 7ee8179acfdb9..e56b7c75c41d1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
@@ -366,12 +366,12 @@ define <256 x i8> @vadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    addi a3, a1, -128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    addi a0, a1, -128
-; CHECK-NEXT:    sltu a3, a1, a0
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a0, a3, a0
+; CHECK-NEXT:    sltu a0, a1, a3
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v16, v16, -1, v0.t
 ; CHECK-NEXT:    bltu a1, a2, .LBB32_2
@@ -1357,9 +1357,9 @@ declare <32 x i64> @llvm.vp.add.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32)
 define <32 x i64> @vadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vadd_vx_v32i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB108_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
index fa82065f3b413..9678fa87dc9b1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
@@ -298,46 +298,36 @@ define <32 x double> @vfsgnj_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    li a3, 16
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    mv a0, a2
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
-; CHECK-NEXT:    bltu a2, a1, .LBB26_2
+; CHECK-NEXT:    mv a1, a2
+; CHECK-NEXT:    vle64.v v24, (a0)
+; CHECK-NEXT:    bltu a2, a3, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfsgnj.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vfsgnj.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfsgnj.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll
index 08f486b601328..990cf03a2e9b5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll
@@ -363,9 +363,9 @@ declare <32 x double> @llvm.vp.fabs.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vfabs_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vfabs_vv_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
index bde842dcc7600..a6c51ced93ddc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
@@ -849,35 +849,35 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a1, a2, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a2)
-; CHECK-NEXT:    addi a2, a0, 128
-; CHECK-NEXT:    vle64.v v8, (a1)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v8, (a2)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    mv a0, a4
+; CHECK-NEXT:    addi a3, a2, 128
+; CHECK-NEXT:    addi a5, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v7, v0, 2
-; CHECK-NEXT:    bltu a4, a1, .LBB50_2
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v16, (a2)
+; CHECK-NEXT:    li a2, 16
+; CHECK-NEXT:    mv a1, a4
+; CHECK-NEXT:    vle64.v v8, (a3)
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v8, (a5)
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    bltu a4, a2, .LBB50_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB50_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
@@ -893,16 +893,16 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    mul a1, a1, a2
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    vmv.v.v v16, v8
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
@@ -941,26 +941,26 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a2, 128
+; CHECK-NEXT:    addi a3, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a2)
-; CHECK-NEXT:    addi a2, a0, 128
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a2)
+; CHECK-NEXT:    mv a1, a4
+; CHECK-NEXT:    vle64.v v24, (a3)
 ; CHECK-NEXT:    vle64.v v0, (a0)
-; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    mv a0, a4
-; CHECK-NEXT:    bltu a4, a1, .LBB51_2
+; CHECK-NEXT:    bltu a4, a2, .LBB51_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB51_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v0, v8, v16
 ; CHECK-NEXT:    addi a0, a4, -16
 ; CHECK-NEXT:    sltu a1, a4, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
index b37c47a32ba21..13c8077a84c56 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
@@ -390,46 +390,36 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    li a3, 16
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    mv a0, a2
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
-; CHECK-NEXT:    bltu a2, a1, .LBB26_2
+; CHECK-NEXT:    mv a1, a2
+; CHECK-NEXT:    vle64.v v24, (a0)
+; CHECK-NEXT:    bltu a2, a3, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmax.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vfmax.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmax.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vfmax.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
index 261523e8ace50..fd43b8bbaf185 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
@@ -390,46 +390,36 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    li a3, 16
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    mv a0, a2
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
-; CHECK-NEXT:    bltu a2, a1, .LBB26_2
+; CHECK-NEXT:    mv a1, a2
+; CHECK-NEXT:    vle64.v v24, (a0)
+; CHECK-NEXT:    bltu a2, a3, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmin.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vfmin.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmin.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vfmin.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
index a5d9b3439e29b..eb4ce757a8385 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
@@ -621,35 +621,35 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a1, a2, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a2)
-; CHECK-NEXT:    addi a2, a0, 128
-; CHECK-NEXT:    vle64.v v8, (a1)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v8, (a2)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    mv a0, a4
+; CHECK-NEXT:    addi a3, a2, 128
+; CHECK-NEXT:    addi a5, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v7, v0, 2
-; CHECK-NEXT:    bltu a4, a1, .LBB50_2
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v16, (a2)
+; CHECK-NEXT:    li a2, 16
+; CHECK-NEXT:    mv a1, a4
+; CHECK-NEXT:    vle64.v v8, (a3)
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v8, (a5)
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    bltu a4, a2, .LBB50_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB50_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
@@ -665,16 +665,16 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    mul a1, a1, a2
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    vmv.v.v v16, v8
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
@@ -713,26 +713,26 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a2, 128
+; CHECK-NEXT:    addi a3, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a2)
-; CHECK-NEXT:    addi a2, a0, 128
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a2)
+; CHECK-NEXT:    mv a1, a4
+; CHECK-NEXT:    vle64.v v24, (a3)
 ; CHECK-NEXT:    vle64.v v0, (a0)
-; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    mv a0, a4
-; CHECK-NEXT:    bltu a4, a1, .LBB51_2
+; CHECK-NEXT:    bltu a4, a2, .LBB51_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB51_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v0, v8, v16
 ; CHECK-NEXT:    addi a0, a4, -16
 ; CHECK-NEXT:    sltu a1, a4, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll
index 968fd9f9bab80..a3853d19c3ef9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll
@@ -355,9 +355,9 @@ declare <32 x double> @llvm.vp.fneg.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vfneg_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vfneg_vv_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll
index 6244419de65b1..d87c1e332ce65 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll
@@ -379,9 +379,9 @@ declare <32 x double> @llvm.vp.sqrt.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vfsqrt_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vfsqrt_vv_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
index fec54b36042fa..28ac46cd5fc88 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
@@ -270,12 +270,12 @@ define <256 x i8> @vmax_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zero
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a3, 128
+; CHECK-NEXT:    addi a4, a2, -128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltu a1, a2, a4
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmax.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB22_2
@@ -1029,9 +1029,9 @@ declare <32 x i64> @llvm.vp.smax.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32)
 define <32 x i64> @vmax_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmax_vx_v32i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB74_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
index 7ca0dbd9adffc..b7555f4b3588b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
@@ -269,12 +269,12 @@ define <256 x i8> @vmaxu_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zer
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a3, 128
+; CHECK-NEXT:    addi a4, a2, -128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltu a1, a2, a4
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmaxu.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB22_2
@@ -1028,9 +1028,9 @@ declare <32 x i64> @llvm.vp.umax.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32)
 define <32 x i64> @vmaxu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmaxu_vx_v32i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB74_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
index ea75742ca6e43..bd49b9876575e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
@@ -270,12 +270,12 @@ define <256 x i8> @vmin_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zero
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a3, 128
+; CHECK-NEXT:    addi a4, a2, -128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltu a1, a2, a4
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmin.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB22_2
@@ -1029,9 +1029,9 @@ declare <32 x i64> @llvm.vp.smin.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32)
 define <32 x i64> @vmin_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmin_vx_v32i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB74_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
index f4f54db64018d..f6e5fd42f07ab 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
@@ -269,12 +269,12 @@ define <256 x i8> @vminu_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zer
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a3, 128
+; CHECK-NEXT:    addi a4, a2, -128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltu a1, a2, a4
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vminu.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB22_2
@@ -1028,9 +1028,9 @@ declare <32 x i64> @llvm.vp.umin.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32)
 define <32 x i64> @vminu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vminu_vx_v32i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB74_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
index 1f6513ae09d60..36cc8dd25bf94 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
@@ -2052,13 +2052,13 @@ define <32 x double> @vpgather_v32f64(<32 x ptr> %ptrs, <32 x i1> %m, i32 zeroex
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v24, (zero), v8, v0.t
 ; RV32-NEXT:    addi a1, a0, -16
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 16
 ; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a1
-; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (zero), v8, v0.t
 ; RV32-NEXT:    vmv8r.v v8, v24
@@ -2077,9 +2077,9 @@ define <32 x double> @vpgather_v32f64(<32 x ptr> %ptrs, <32 x i1> %m, i32 zeroex
 ; RV64-NEXT:    addi a1, a0, -16
 ; RV64-NEXT:    sltu a0, a0, a1
 ; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a0, a1
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a0, a0, a1
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (zero), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2093,8 +2093,8 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vsext.vf4 v16, v8
-; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    vsll.vi v16, v16, 3
+; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    mv a2, a1
 ; RV32-NEXT:    bltu a1, a3, .LBB95_2
 ; RV32-NEXT:  # %bb.1:
@@ -2103,13 +2103,13 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v16, 16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2119,11 +2119,11 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
 ; RV64-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf8 v24, v8
+; RV64-NEXT:    vsext.vf8 v16, v8
+; RV64-NEXT:    vsext.vf8 v24, v10
+; RV64-NEXT:    vsll.vi v8, v16, 3
+; RV64-NEXT:    vsll.vi v16, v24, 3
 ; RV64-NEXT:    li a3, 16
-; RV64-NEXT:    vsext.vf8 v16, v10
-; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB95_2
 ; RV64-NEXT:  # %bb.1:
@@ -2134,9 +2134,9 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
 ; RV64-NEXT:    addi a2, a1, -16
 ; RV64-NEXT:    sltu a1, a1, a2
 ; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2151,8 +2151,8 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vsext.vf4 v16, v8
-; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    vsll.vi v16, v16, 3
+; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    mv a2, a1
 ; RV32-NEXT:    bltu a1, a3, .LBB96_2
 ; RV32-NEXT:  # %bb.1:
@@ -2161,13 +2161,13 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v16, 16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2175,14 +2175,14 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV64-LABEL: vpgather_baseidx_sext_v32i8_v32f64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf8 v24, v8
+; RV64-NEXT:    vsext.vf8 v16, v8
 ; RV64-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 16
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf8 v16, v8
-; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    vsll.vi v8, v24, 3
+; RV64-NEXT:    vsext.vf8 v24, v8
+; RV64-NEXT:    vsll.vi v8, v16, 3
+; RV64-NEXT:    vsll.vi v16, v24, 3
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB96_2
 ; RV64-NEXT:  # %bb.1:
@@ -2193,9 +2193,9 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV64-NEXT:    addi a2, a1, -16
 ; RV64-NEXT:    sltu a1, a1, a2
 ; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2210,11 +2210,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    li a3, 8
-; RV32-NEXT:    li a4, 16
 ; RV32-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; RV32-NEXT:    vwmulu.vx v16, v8, a3
+; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    mv a2, a1
-; RV32-NEXT:    bltu a1, a4, .LBB97_2
+; RV32-NEXT:    bltu a1, a3, .LBB97_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:  .LBB97_2:
@@ -2225,9 +2225,9 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV32-NEXT:    addi a2, a1, -16
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei16.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2236,11 +2236,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a2, 32
 ; RV64-NEXT:    li a3, 8
-; RV64-NEXT:    li a4, 16
 ; RV64-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; RV64-NEXT:    vwmulu.vx v16, v8, a3
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    mv a2, a1
-; RV64-NEXT:    bltu a1, a4, .LBB97_2
+; RV64-NEXT:    bltu a1, a3, .LBB97_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:  .LBB97_2:
@@ -2251,9 +2251,9 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV64-NEXT:    addi a2, a1, -16
 ; RV64-NEXT:    sltu a1, a1, a2
 ; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei16.v v16, (a0), v24, v0.t
 ; RV64-NEXT:    ret
@@ -2268,11 +2268,11 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    li a3, 8
-; RV32-NEXT:    li a4, 16
 ; RV32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; RV32-NEXT:    vwmulsu.vx v16, v8, a3
+; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    mv a2, a1
-; RV32-NEXT:    bltu a1, a4, .LBB98_2
+; RV32-NEXT:    bltu a1, a3, .LBB98_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:  .LBB98_2:
@@ -2283,9 +2283,9 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
 ; RV32-NEXT:    addi a2, a1, -16
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2295,11 +2295,11 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
 ; RV64-NEXT:    vsetivli zero, 16, e16, m4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v12, v8, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf4 v24, v8
+; RV64-NEXT:    vsext.vf4 v16, v8
+; RV64-NEXT:    vsext.vf4 v24, v12
+; RV64-NEXT:    vsll.vi v8, v16, 3
+; RV64-NEXT:    vsll.vi v16, v24, 3
 ; RV64-NEXT:    li a3, 16
-; RV64-NEXT:    vsext.vf4 v16, v12
-; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB98_2
 ; RV64-NEXT:  # %bb.1:
@@ -2310,9 +2310,9 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
 ; RV64-NEXT:    addi a2, a1, -16
 ; RV64-NEXT:    sltu a1, a1, a2
 ; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2326,11 +2326,11 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    li a3, 8
-; RV32-NEXT:    li a4, 16
 ; RV32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; RV32-NEXT:    vwmulsu.vx v16, v8, a3
+; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    mv a2, a1
-; RV32-NEXT:    bltu a1, a4, .LBB99_2
+; RV32-NEXT:    bltu a1, a3, .LBB99_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:  .LBB99_2:
@@ -2341,9 +2341,9 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV32-NEXT:    addi a2, a1, -16
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2351,14 +2351,14 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV64-LABEL: vpgather_baseidx_sext_v32i16_v32f64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf4 v24, v8
+; RV64-NEXT:    vsext.vf4 v16, v8
 ; RV64-NEXT:    vsetivli zero, 16, e16, m4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 16
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf4 v16, v8
-; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    vsll.vi v8, v24, 3
+; RV64-NEXT:    vsext.vf4 v24, v8
+; RV64-NEXT:    vsll.vi v8, v16, 3
+; RV64-NEXT:    vsll.vi v16, v24, 3
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB99_2
 ; RV64-NEXT:  # %bb.1:
@@ -2369,9 +2369,9 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV64-NEXT:    addi a2, a1, -16
 ; RV64-NEXT:    sltu a1, a1, a2
 ; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2386,11 +2386,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    li a3, 8
-; RV32-NEXT:    li a4, 16
 ; RV32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; RV32-NEXT:    vwmulu.vx v16, v8, a3
+; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    mv a2, a1
-; RV32-NEXT:    bltu a1, a4, .LBB100_2
+; RV32-NEXT:    bltu a1, a3, .LBB100_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:  .LBB100_2:
@@ -2401,9 +2401,9 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV32-NEXT:    addi a2, a1, -16
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2412,11 +2412,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a2, 32
 ; RV64-NEXT:    li a3, 8
-; RV64-NEXT:    li a4, 16
 ; RV64-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; RV64-NEXT:    vwmulu.vx v16, v8, a3
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    mv a2, a1
-; RV64-NEXT:    bltu a1, a4, .LBB100_2
+; RV64-NEXT:    bltu a1, a3, .LBB100_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:  .LBB100_2:
@@ -2427,9 +2427,9 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV64-NEXT:    addi a2, a1, -16
 ; RV64-NEXT:    sltu a1, a1, a2
 ; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV64-NEXT:    ret
@@ -2443,9 +2443,9 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
 ; RV32-LABEL: vpgather_baseidx_v32i32_v32f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v16, v8, 3
+; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    mv a2, a1
 ; RV32-NEXT:    bltu a1, a3, .LBB101_2
 ; RV32-NEXT:  # %bb.1:
@@ -2454,13 +2454,13 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v16, 16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2470,10 +2470,10 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v8, 16
 ; RV64-NEXT:    li a2, 8
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vwmulsu.vx v16, v24, a2
 ; RV64-NEXT:    vwmulsu.vx v24, v8, a2
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB101_2
 ; RV64-NEXT:  # %bb.1:
@@ -2484,9 +2484,9 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
 ; RV64-NEXT:    addi a2, a1, -16
 ; RV64-NEXT:    sltu a1, a1, a2
 ; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2499,9 +2499,9 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV32-LABEL: vpgather_baseidx_sext_v32i32_v32f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v16, v8, 3
+; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    mv a2, a1
 ; RV32-NEXT:    bltu a1, a3, .LBB102_2
 ; RV32-NEXT:  # %bb.1:
@@ -2510,13 +2510,13 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v16, 16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2526,10 +2526,10 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v8, 16
 ; RV64-NEXT:    li a2, 8
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vwmulsu.vx v16, v24, a2
 ; RV64-NEXT:    vwmulsu.vx v24, v8, a2
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB102_2
 ; RV64-NEXT:  # %bb.1:
@@ -2540,9 +2540,9 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV64-NEXT:    addi a2, a1, -16
 ; RV64-NEXT:    sltu a1, a1, a2
 ; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2556,9 +2556,9 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV32-LABEL: vpgather_baseidx_zext_v32i32_v32f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v16, v8, 3
+; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    mv a2, a1
 ; RV32-NEXT:    bltu a1, a3, .LBB103_2
 ; RV32-NEXT:  # %bb.1:
@@ -2567,13 +2567,13 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v16, 16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2583,10 +2583,10 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v8, 16
 ; RV64-NEXT:    li a2, 8
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vwmulu.vx v16, v24, a2
 ; RV64-NEXT:    vwmulu.vx v24, v8, a2
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB103_2
 ; RV64-NEXT:  # %bb.1:
@@ -2597,9 +2597,9 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV64-NEXT:    addi a2, a1, -16
 ; RV64-NEXT:    sltu a1, a1, a2
 ; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2618,16 +2618,16 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x
 ; RV32-NEXT:    vnsrl.wi v16, v8, 0
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    addi a3, a1, -16
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vslideup.vi v16, v24, 16
 ; RV32-NEXT:    vsll.vi v24, v16, 3
 ; RV32-NEXT:    sltu a2, a1, a3
 ; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    and a2, a2, a3
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
-; RV32-NEXT:    and a2, a2, a3
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v8, v0.t
 ; RV32-NEXT:    li a2, 16
@@ -2644,8 +2644,8 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsll.vi v8, v8, 3
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB104_2
 ; RV64-NEXT:  # %bb.1:
@@ -2656,9 +2656,9 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x
 ; RV64-NEXT:    addi a2, a1, -16
 ; RV64-NEXT:    sltu a1, a1, a2
 ; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
index 6c9989775f790..4f3179823f5b0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
@@ -374,12 +374,12 @@ define <32 x double> @vpload_v32f64(ptr %ptr, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a2, a1, -16
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 2
+; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    sltu a1, a1, a2
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a1, a1, a2
-; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v0, 2
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
@@ -403,12 +403,12 @@ define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    addi a4, a3, -16
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v8, 2
 ; CHECK-NEXT:    sltu a3, a3, a4
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    addi a4, a1, 128
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v8, 2
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a4), v0.t
 ; CHECK-NEXT:    addi a3, a2, -32
@@ -420,9 +420,9 @@ define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    li a4, 16
 ; CHECK-NEXT:  .LBB32_4:
+; CHECK-NEXT:    addi a5, a1, 256
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v8, 4
-; CHECK-NEXT:    addi a5, a1, 256
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v24, (a5), v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB32_6
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
index 6394542479d1b..c6e64fe2bd32e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
@@ -1360,22 +1360,22 @@ define <32 x double> @vpmerge_vv_v32f64(<32 x double> %va, <32 x double> %vb, <3
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    li a3, 16
 ; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    mv a0, a2
-; CHECK-NEXT:    bltu a2, a1, .LBB83_2
+; CHECK-NEXT:    bltu a2, a3, .LBB83_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB83_2:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, tu, ma
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, tu, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, tu, ma
@@ -1406,9 +1406,9 @@ define <32 x double> @vpmerge_vf_v32f64(double %a, <32 x double> %vb, <32 x i1>
 ; CHECK-NEXT:    addi a1, a0, -16
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, tu, ma
 ; CHECK-NEXT:    vfmerge.vfm v16, v16, fa0, v0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
index f7e4716d2c847..cf5650c0ab4ed 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
@@ -1756,13 +1756,13 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (zero), v24, v0.t
 ; RV32-NEXT:    addi a0, a1, -16
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    sltu a1, a1, a0
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a0, a1, a0
-; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v24, 16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v16, (zero), v8, v0.t
 ; RV32-NEXT:    ret
@@ -1778,23 +1778,23 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m
 ; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    addi a1, a0, 128
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v16, (a1)
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:    vle64.v v24, (a0)
-; RV64-NEXT:    li a1, 16
-; RV64-NEXT:    mv a0, a2
-; RV64-NEXT:    bltu a2, a1, .LBB83_2
+; RV64-NEXT:    bltu a2, a3, .LBB83_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a0, 16
+; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:  .LBB83_2:
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v8, (zero), v24, v0.t
 ; RV64-NEXT:    addi a0, a2, -16
 ; RV64-NEXT:    sltu a1, a2, a0
 ; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    and a0, a1, a0
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a0, a1, a0
 ; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1816,8 +1816,8 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
 ; RV32-NEXT:    li a3, 32
 ; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v24, (a1)
-; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    vsll.vi v24, v24, 3
+; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:    bltu a2, a3, .LBB84_2
 ; RV32-NEXT:  # %bb.1:
@@ -1826,13 +1826,13 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    addi a1, a2, -16
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    sltu a2, a2, a1
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a1, a2, a1
-; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v24, 16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v16, (a0), v8, v0.t
 ; RV32-NEXT:    ret
@@ -1854,14 +1854,14 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
 ; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a3, 32
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV64-NEXT:    vle32.v v24, (a1)
+; RV64-NEXT:    vle32.v v16, (a1)
 ; RV64-NEXT:    li a1, 8
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT:    vslidedown.vi v16, v24, 16
+; RV64-NEXT:    vslidedown.vi v24, v16, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vwmulsu.vx v8, v16, a1
 ; RV64-NEXT:    vwmulsu.vx v16, v24, a1
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:    bltu a2, a3, .LBB84_2
 ; RV64-NEXT:  # %bb.1:
@@ -1870,20 +1870,20 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
 ; RV64-NEXT:    addi a3, sp, 16
 ; RV64-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v24, (a0), v16, v0.t
+; RV64-NEXT:    vsoxei64.v v24, (a0), v8, v0.t
 ; RV64-NEXT:    addi a1, a2, -16
 ; RV64-NEXT:    sltu a2, a2, a1
 ; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 4
 ; RV64-NEXT:    add sp, sp, a0
@@ -1902,8 +1902,8 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV32-NEXT:    li a3, 32
 ; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v24, (a1)
-; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    vsll.vi v24, v24, 3
+; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:    bltu a2, a3, .LBB85_2
 ; RV32-NEXT:  # %bb.1:
@@ -1912,13 +1912,13 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    addi a1, a2, -16
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    sltu a2, a2, a1
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a1, a2, a1
-; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v24, 16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v16, (a0), v8, v0.t
 ; RV32-NEXT:    ret
@@ -1940,14 +1940,14 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a3, 32
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV64-NEXT:    vle32.v v24, (a1)
+; RV64-NEXT:    vle32.v v16, (a1)
 ; RV64-NEXT:    li a1, 8
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT:    vslidedown.vi v16, v24, 16
+; RV64-NEXT:    vslidedown.vi v24, v16, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vwmulsu.vx v8, v16, a1
 ; RV64-NEXT:    vwmulsu.vx v16, v24, a1
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:    bltu a2, a3, .LBB85_2
 ; RV64-NEXT:  # %bb.1:
@@ -1956,20 +1956,20 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    addi a3, sp, 16
 ; RV64-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v24, (a0), v16, v0.t
+; RV64-NEXT:    vsoxei64.v v24, (a0), v8, v0.t
 ; RV64-NEXT:    addi a1, a2, -16
 ; RV64-NEXT:    sltu a2, a2, a1
 ; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 4
 ; RV64-NEXT:    add sp, sp, a0
@@ -1989,8 +1989,8 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV32-NEXT:    li a3, 32
 ; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v24, (a1)
-; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    vsll.vi v24, v24, 3
+; RV32-NEXT:    li a3, 16
 ; RV32-NEXT:    mv a1, a2
 ; RV32-NEXT:    bltu a2, a3, .LBB86_2
 ; RV32-NEXT:  # %bb.1:
@@ -1999,13 +1999,13 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    addi a1, a2, -16
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    sltu a2, a2, a1
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a1, a2, a1
-; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v24, 16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v16, (a0), v8, v0.t
 ; RV32-NEXT:    ret
@@ -2027,14 +2027,14 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a3, 32
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV64-NEXT:    vle32.v v24, (a1)
+; RV64-NEXT:    vle32.v v16, (a1)
 ; RV64-NEXT:    li a1, 8
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT:    vslidedown.vi v16, v24, 16
+; RV64-NEXT:    vslidedown.vi v24, v16, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vwmulu.vx v8, v16, a1
 ; RV64-NEXT:    vwmulu.vx v16, v24, a1
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:    bltu a2, a3, .LBB86_2
 ; RV64-NEXT:  # %bb.1:
@@ -2043,20 +2043,20 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    addi a3, sp, 16
 ; RV64-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v24, (a0), v16, v0.t
+; RV64-NEXT:    vsoxei64.v v24, (a0), v8, v0.t
 ; RV64-NEXT:    addi a1, a2, -16
 ; RV64-NEXT:    sltu a2, a2, a1
 ; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 4
 ; RV64-NEXT:    add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
index d30e8b46e6df2..d3a8e8548f5b4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
@@ -304,12 +304,12 @@ define void @vpstore_v32f64(<32 x double> %val, ptr %ptr, <32 x i1> %m, i32 zero
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a2, a1, -16
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 2
+; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    sltu a1, a1, a2
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a1, a1, a2
-; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v0, 2
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
index 7afd31fdd663c..8a15fa6929708 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
@@ -375,12 +375,12 @@ define <256 x i8> @vsadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    addi a3, a1, -128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    addi a0, a1, -128
-; CHECK-NEXT:    sltu a3, a1, a0
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a0, a3, a0
+; CHECK-NEXT:    sltu a0, a1, a3
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsadd.vi v16, v16, -1, v0.t
 ; CHECK-NEXT:    bltu a1, a2, .LBB32_2
@@ -1370,9 +1370,9 @@ declare <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i
 define <32 x i64> @vsadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vsadd_vx_v32i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB108_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
index f61b112fd8024..0f2ff55d767d4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
@@ -371,12 +371,12 @@ define <256 x i8> @vsaddu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    addi a3, a1, -128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    addi a0, a1, -128
-; CHECK-NEXT:    sltu a3, a1, a0
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a0, a3, a0
+; CHECK-NEXT:    sltu a0, a1, a3
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsaddu.vi v16, v16, -1, v0.t
 ; CHECK-NEXT:    bltu a1, a2, .LBB32_2
@@ -1366,9 +1366,9 @@ declare <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i
 define <32 x i64> @vsaddu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vsaddu_vx_v32i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB108_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll
index dc83edba5ae8c..c5506e175ce00 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll
@@ -8,104 +8,48 @@ define <512 x i8> @vadd_v512i8_zvl128(<512 x i8> %a, <512 x i8> %b) #0 {
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a4, 48
-; CHECK-NEXT:    mul a2, a2, a4
+; CHECK-NEXT:    slli a2, a2, 4
 ; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 5
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a4, 40
-; CHECK-NEXT:    mul a2, a2, a4
+; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a2, 128
-; CHECK-NEXT:    addi a4, a3, 128
-; CHECK-NEXT:    addi a5, a3, 384
+; CHECK-NEXT:    addi a4, a3, 256
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a5)
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a5, 24
-; CHECK-NEXT:    mul a2, a2, a5
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a2, a1, 128
-; CHECK-NEXT:    vle8.v v8, (a1)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a1, a3, 256
-; CHECK-NEXT:    vle8.v v8, (a1)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle8.v v8, (a2)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vle8.v v24, (a4)
+; CHECK-NEXT:    addi a2, a3, 384
+; CHECK-NEXT:    vle8.v v0, (a1)
+; CHECK-NEXT:    addi a1, a1, 128
+; CHECK-NEXT:    vadd.vv v8, v0, v24
+; CHECK-NEXT:    addi a4, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle8.v v0, (a2)
+; CHECK-NEXT:    vle8.v v24, (a1)
+; CHECK-NEXT:    vadd.vv v24, v24, v0
+; CHECK-NEXT:    addi a1, a3, 128
+; CHECK-NEXT:    vle8.v v0, (a1)
+; CHECK-NEXT:    vadd.vv v16, v16, v0
 ; CHECK-NEXT:    vle8.v v0, (a3)
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a1, a1, a2
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vadd.vv v16, v16, v8
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vadd.vv v24, v8, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 40
-; CHECK-NEXT:    mul a1, a1, a2
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vadd.vv v0, v8, v0
 ; CHECK-NEXT:    vse8.v v0, (a0)
 ; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vse8.v v16, (a1)
+; CHECK-NEXT:    vse8.v v24, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vse8.v v8, (a1)
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vse8.v v24, (a0)
+; CHECK-NEXT:    vse8.v v16, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 48
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -121,10 +65,10 @@ define <512 x i8> @vadd_v512i8_zvl256(<512 x i8> %a, <512 x i8> %b) #1 {
 ; CHECK-NEXT:    addi a1, a0, 256
 ; CHECK-NEXT:    li a2, 256
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vle8.v v24, (a0)
-; CHECK-NEXT:    vle8.v v0, (a1)
-; CHECK-NEXT:    vadd.vv v8, v8, v24
-; CHECK-NEXT:    vadd.vv v16, v16, v0
+; CHECK-NEXT:    vle8.v v24, (a1)
+; CHECK-NEXT:    vle8.v v0, (a0)
+; CHECK-NEXT:    vadd.vv v8, v8, v0
+; CHECK-NEXT:    vadd.vv v16, v16, v24
 ; CHECK-NEXT:    ret
   %c = add <512 x i8> %a, %b
   ret <512 x i8> %c
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
index 05254e60b65b7..81c98d6881e72 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
@@ -155,46 +155,30 @@ declare <256 x i8> @llvm.vp.select.v256i8(<256 x i1>, <256 x i8>, <256 x i8>, i3
 define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i32 zeroext %evl) {
 ; CHECK-LABEL: select_v256i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v6, v8
 ; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    addi a4, a1, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vle8.v v24, (a0)
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a3, -128
-; CHECK-NEXT:    vle8.v v8, (a4)
+; CHECK-NEXT:    vle8.v v24, (a4)
 ; CHECK-NEXT:    sltu a4, a3, a0
-; CHECK-NEXT:    vle8.v v16, (a1)
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a0, a4, a0
-; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
+; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    bltu a3, a2, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:  .LBB11_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; CHECK-NEXT:    vmv8r.v v16, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <256 x i8> @llvm.vp.select.v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i32 %evl)
   ret <256 x i8> %v
@@ -203,58 +187,21 @@ define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i3
 define <256 x i8> @select_evl_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c) {
 ; CHECK-LABEL: select_evl_v256i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a2, a2, a3
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vle8.v v16, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a0, a1, 128
 ; CHECK-NEXT:    vle8.v v24, (a0)
-; CHECK-NEXT:    vle8.v v16, (a1)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, a1, 128
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT:    vle8.v v8, (a1)
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; CHECK-NEXT:    vmv8r.v v16, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <256 x i8> @llvm.vp.select.v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i32 129)
   ret <256 x i8> %v
@@ -418,23 +365,23 @@ define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    li a3, 16
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    mv a0, a2
-; CHECK-NEXT:    bltu a2, a1, .LBB25_2
+; CHECK-NEXT:    bltu a2, a3, .LBB25_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB25_2:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -453,56 +400,16 @@ define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32
 define <32 x i64> @select_evl_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c) {
 ; CHECK-LABEL: select_evl_v32i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a1, a1, a2
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vle64.v v24, (a0)
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v24, (a0)
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vslidedown.vi v0, v0, 2
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
 ; CHECK-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.select.v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32 17)
   ret <32 x i64> %v
@@ -621,20 +528,20 @@ define <64 x float> @select_v64f32(<64 x i1> %a, <64 x float> %b, <64 x float> %
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v16, (a1)
+; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:    bltu a2, a3, .LBB35_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:  .LBB35_2:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; CHECK-NEXT:    addi a0, a2, -32
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 4
-; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
index 557882ee31d4c..75f0119d14c2a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
@@ -5,26 +5,26 @@
 define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; RV32-LABEL: vselect_vv_v6i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lbu a2, 0(a2)
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a1)
-; RV32-NEXT:    slli a1, a2, 30
-; RV32-NEXT:    andi a4, a2, 1
+; RV32-NEXT:    lbu a1, 0(a2)
+; RV32-NEXT:    slli a2, a1, 30
+; RV32-NEXT:    andi a4, a1, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a4
-; RV32-NEXT:    slli a4, a2, 29
-; RV32-NEXT:    srli a1, a1, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    slli a1, a2, 28
+; RV32-NEXT:    slli a4, a1, 29
+; RV32-NEXT:    srli a2, a2, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a2
+; RV32-NEXT:    slli a2, a1, 28
 ; RV32-NEXT:    srli a4, a4, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a4
-; RV32-NEXT:    slli a4, a2, 27
-; RV32-NEXT:    srli a2, a2, 5
-; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    slli a4, a1, 27
+; RV32-NEXT:    srli a1, a1, 5
+; RV32-NEXT:    srli a2, a2, 31
 ; RV32-NEXT:    srli a4, a4, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    vslide1down.vx v10, v10, a4
 ; RV32-NEXT:    vslide1down.vx v10, v10, a2
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
+; RV32-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
 ; RV32-NEXT:    vmsne.vi v0, v10, 0
@@ -35,26 +35,26 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ;
 ; RV64-LABEL: vselect_vv_v6i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lbu a2, 0(a2)
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a1)
-; RV64-NEXT:    slli a1, a2, 62
-; RV64-NEXT:    andi a4, a2, 1
+; RV64-NEXT:    lbu a1, 0(a2)
+; RV64-NEXT:    slli a2, a1, 62
+; RV64-NEXT:    andi a4, a1, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a4
-; RV64-NEXT:    slli a4, a2, 61
-; RV64-NEXT:    srli a1, a1, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    slli a1, a2, 60
+; RV64-NEXT:    slli a4, a1, 61
+; RV64-NEXT:    srli a2, a2, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a2
+; RV64-NEXT:    slli a2, a1, 60
 ; RV64-NEXT:    srli a4, a4, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a4
-; RV64-NEXT:    slli a4, a2, 59
-; RV64-NEXT:    srli a2, a2, 5
-; RV64-NEXT:    srli a1, a1, 63
+; RV64-NEXT:    slli a4, a1, 59
+; RV64-NEXT:    srli a1, a1, 5
+; RV64-NEXT:    srli a2, a2, 63
 ; RV64-NEXT:    srli a4, a4, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    vslide1down.vx v10, v10, a4
 ; RV64-NEXT:    vslide1down.vx v10, v10, a2
+; RV64-NEXT:    vslide1down.vx v10, v10, a4
+; RV64-NEXT:    vslide1down.vx v10, v10, a1
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
 ; RV64-NEXT:    vmsne.vi v0, v10, 0
@@ -73,26 +73,26 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) {
 ; RV32-LABEL: vselect_vx_v6i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lbu a2, 0(a2)
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a1)
-; RV32-NEXT:    slli a1, a2, 30
-; RV32-NEXT:    andi a4, a2, 1
+; RV32-NEXT:    lbu a1, 0(a2)
+; RV32-NEXT:    slli a2, a1, 30
+; RV32-NEXT:    andi a4, a1, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a4
-; RV32-NEXT:    slli a4, a2, 29
-; RV32-NEXT:    srli a1, a1, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    slli a1, a2, 28
+; RV32-NEXT:    slli a4, a1, 29
+; RV32-NEXT:    srli a2, a2, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a2
+; RV32-NEXT:    slli a2, a1, 28
 ; RV32-NEXT:    srli a4, a4, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a4
-; RV32-NEXT:    slli a4, a2, 27
-; RV32-NEXT:    srli a2, a2, 5
-; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    slli a4, a1, 27
+; RV32-NEXT:    srli a1, a1, 5
+; RV32-NEXT:    srli a2, a2, 31
 ; RV32-NEXT:    srli a4, a4, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    vslide1down.vx v10, v10, a4
 ; RV32-NEXT:    vslide1down.vx v10, v10, a2
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
+; RV32-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
 ; RV32-NEXT:    vmsne.vi v0, v10, 0
@@ -103,26 +103,26 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) {
 ;
 ; RV64-LABEL: vselect_vx_v6i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lbu a2, 0(a2)
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a1)
-; RV64-NEXT:    slli a1, a2, 62
-; RV64-NEXT:    andi a4, a2, 1
+; RV64-NEXT:    lbu a1, 0(a2)
+; RV64-NEXT:    slli a2, a1, 62
+; RV64-NEXT:    andi a4, a1, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a4
-; RV64-NEXT:    slli a4, a2, 61
-; RV64-NEXT:    srli a1, a1, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    slli a1, a2, 60
+; RV64-NEXT:    slli a4, a1, 61
+; RV64-NEXT:    srli a2, a2, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a2
+; RV64-NEXT:    slli a2, a1, 60
 ; RV64-NEXT:    srli a4, a4, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a4
-; RV64-NEXT:    slli a4, a2, 59
-; RV64-NEXT:    srli a2, a2, 5
-; RV64-NEXT:    srli a1, a1, 63
+; RV64-NEXT:    slli a4, a1, 59
+; RV64-NEXT:    srli a1, a1, 5
+; RV64-NEXT:    srli a2, a2, 63
 ; RV64-NEXT:    srli a4, a4, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    vslide1down.vx v10, v10, a4
 ; RV64-NEXT:    vslide1down.vx v10, v10, a2
+; RV64-NEXT:    vslide1down.vx v10, v10, a4
+; RV64-NEXT:    vslide1down.vx v10, v10, a1
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
 ; RV64-NEXT:    vmsne.vi v0, v10, 0
@@ -142,26 +142,26 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) {
 define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) {
 ; RV32-LABEL: vselect_vi_v6i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lbu a1, 0(a1)
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    slli a0, a1, 30
-; RV32-NEXT:    andi a3, a1, 1
+; RV32-NEXT:    lbu a0, 0(a1)
+; RV32-NEXT:    slli a1, a0, 30
+; RV32-NEXT:    andi a3, a0, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a3
-; RV32-NEXT:    slli a3, a1, 29
-; RV32-NEXT:    srli a0, a0, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    slli a0, a1, 28
+; RV32-NEXT:    slli a3, a0, 29
+; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a1
+; RV32-NEXT:    slli a1, a0, 28
 ; RV32-NEXT:    srli a3, a3, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a3
-; RV32-NEXT:    slli a3, a1, 27
-; RV32-NEXT:    srli a1, a1, 5
-; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    slli a3, a0, 27
+; RV32-NEXT:    srli a0, a0, 5
+; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    srli a3, a3, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    vslide1down.vx v10, v10, a3
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
+; RV32-NEXT:    vslide1down.vx v10, v10, a3
+; RV32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
 ; RV32-NEXT:    vmsne.vi v0, v10, 0
@@ -172,26 +172,26 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) {
 ;
 ; RV64-LABEL: vselect_vi_v6i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lbu a1, 0(a1)
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    slli a0, a1, 62
-; RV64-NEXT:    andi a3, a1, 1
+; RV64-NEXT:    lbu a0, 0(a1)
+; RV64-NEXT:    slli a1, a0, 62
+; RV64-NEXT:    andi a3, a0, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a3
-; RV64-NEXT:    slli a3, a1, 61
-; RV64-NEXT:    srli a0, a0, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    slli a0, a1, 60
+; RV64-NEXT:    slli a3, a0, 61
+; RV64-NEXT:    srli a1, a1, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a1
+; RV64-NEXT:    slli a1, a0, 60
 ; RV64-NEXT:    srli a3, a3, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a3
-; RV64-NEXT:    slli a3, a1, 59
-; RV64-NEXT:    srli a1, a1, 5
-; RV64-NEXT:    srli a0, a0, 63
+; RV64-NEXT:    slli a3, a0, 59
+; RV64-NEXT:    srli a0, a0, 5
+; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    srli a3, a3, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    vslide1down.vx v10, v10, a3
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
+; RV64-NEXT:    vslide1down.vx v10, v10, a3
+; RV64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
 ; RV64-NEXT:    vmsne.vi v0, v10, 0
@@ -210,26 +210,26 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) {
 define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; RV32-LABEL: vselect_vv_v6f32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lbu a2, 0(a2)
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a1)
-; RV32-NEXT:    slli a1, a2, 30
-; RV32-NEXT:    andi a4, a2, 1
+; RV32-NEXT:    lbu a1, 0(a2)
+; RV32-NEXT:    slli a2, a1, 30
+; RV32-NEXT:    andi a4, a1, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a4
-; RV32-NEXT:    slli a4, a2, 29
-; RV32-NEXT:    srli a1, a1, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    slli a1, a2, 28
+; RV32-NEXT:    slli a4, a1, 29
+; RV32-NEXT:    srli a2, a2, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a2
+; RV32-NEXT:    slli a2, a1, 28
 ; RV32-NEXT:    srli a4, a4, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a4
-; RV32-NEXT:    slli a4, a2, 27
-; RV32-NEXT:    srli a2, a2, 5
-; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    slli a4, a1, 27
+; RV32-NEXT:    srli a1, a1, 5
+; RV32-NEXT:    srli a2, a2, 31
 ; RV32-NEXT:    srli a4, a4, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    vslide1down.vx v10, v10, a4
 ; RV32-NEXT:    vslide1down.vx v10, v10, a2
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
+; RV32-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
 ; RV32-NEXT:    vmsne.vi v0, v10, 0
@@ -240,26 +240,26 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ;
 ; RV64-LABEL: vselect_vv_v6f32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lbu a2, 0(a2)
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a1)
-; RV64-NEXT:    slli a1, a2, 62
-; RV64-NEXT:    andi a4, a2, 1
+; RV64-NEXT:    lbu a1, 0(a2)
+; RV64-NEXT:    slli a2, a1, 62
+; RV64-NEXT:    andi a4, a1, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a4
-; RV64-NEXT:    slli a4, a2, 61
-; RV64-NEXT:    srli a1, a1, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    slli a1, a2, 60
+; RV64-NEXT:    slli a4, a1, 61
+; RV64-NEXT:    srli a2, a2, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a2
+; RV64-NEXT:    slli a2, a1, 60
 ; RV64-NEXT:    srli a4, a4, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a4
-; RV64-NEXT:    slli a4, a2, 59
-; RV64-NEXT:    srli a2, a2, 5
-; RV64-NEXT:    srli a1, a1, 63
+; RV64-NEXT:    slli a4, a1, 59
+; RV64-NEXT:    srli a1, a1, 5
+; RV64-NEXT:    srli a2, a2, 63
 ; RV64-NEXT:    srli a4, a4, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    vslide1down.vx v10, v10, a4
 ; RV64-NEXT:    vslide1down.vx v10, v10, a2
+; RV64-NEXT:    vslide1down.vx v10, v10, a4
+; RV64-NEXT:    vslide1down.vx v10, v10, a1
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
 ; RV64-NEXT:    vmsne.vi v0, v10, 0
@@ -278,26 +278,26 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) {
 ; RV32-LABEL: vselect_vx_v6f32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lbu a1, 0(a1)
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    slli a0, a1, 30
-; RV32-NEXT:    andi a3, a1, 1
+; RV32-NEXT:    lbu a0, 0(a1)
+; RV32-NEXT:    slli a1, a0, 30
+; RV32-NEXT:    andi a3, a0, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a3
-; RV32-NEXT:    slli a3, a1, 29
-; RV32-NEXT:    srli a0, a0, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    slli a0, a1, 28
+; RV32-NEXT:    slli a3, a0, 29
+; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a1
+; RV32-NEXT:    slli a1, a0, 28
 ; RV32-NEXT:    srli a3, a3, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a3
-; RV32-NEXT:    slli a3, a1, 27
-; RV32-NEXT:    srli a1, a1, 5
-; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    slli a3, a0, 27
+; RV32-NEXT:    srli a0, a0, 5
+; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    srli a3, a3, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    vslide1down.vx v10, v10, a3
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
+; RV32-NEXT:    vslide1down.vx v10, v10, a3
+; RV32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
 ; RV32-NEXT:    vmsne.vi v0, v10, 0
@@ -308,26 +308,26 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) {
 ;
 ; RV64-LABEL: vselect_vx_v6f32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lbu a1, 0(a1)
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    slli a0, a1, 62
-; RV64-NEXT:    andi a3, a1, 1
+; RV64-NEXT:    lbu a0, 0(a1)
+; RV64-NEXT:    slli a1, a0, 62
+; RV64-NEXT:    andi a3, a0, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a3
-; RV64-NEXT:    slli a3, a1, 61
-; RV64-NEXT:    srli a0, a0, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    slli a0, a1, 60
+; RV64-NEXT:    slli a3, a0, 61
+; RV64-NEXT:    srli a1, a1, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a1
+; RV64-NEXT:    slli a1, a0, 60
 ; RV64-NEXT:    srli a3, a3, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a3
-; RV64-NEXT:    slli a3, a1, 59
-; RV64-NEXT:    srli a1, a1, 5
-; RV64-NEXT:    srli a0, a0, 63
+; RV64-NEXT:    slli a3, a0, 59
+; RV64-NEXT:    srli a0, a0, 5
+; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    srli a3, a3, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    vslide1down.vx v10, v10, a3
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
+; RV64-NEXT:    vslide1down.vx v10, v10, a3
+; RV64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
 ; RV64-NEXT:    vmsne.vi v0, v10, 0
@@ -347,26 +347,26 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) {
 define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) {
 ; RV32-LABEL: vselect_vfpzero_v6f32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lbu a1, 0(a1)
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    slli a0, a1, 30
-; RV32-NEXT:    andi a3, a1, 1
+; RV32-NEXT:    lbu a0, 0(a1)
+; RV32-NEXT:    slli a1, a0, 30
+; RV32-NEXT:    andi a3, a0, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a3
-; RV32-NEXT:    slli a3, a1, 29
-; RV32-NEXT:    srli a0, a0, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    slli a0, a1, 28
+; RV32-NEXT:    slli a3, a0, 29
+; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a1
+; RV32-NEXT:    slli a1, a0, 28
 ; RV32-NEXT:    srli a3, a3, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a3
-; RV32-NEXT:    slli a3, a1, 27
-; RV32-NEXT:    srli a1, a1, 5
-; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    slli a3, a0, 27
+; RV32-NEXT:    srli a0, a0, 5
+; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    srli a3, a3, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    vslide1down.vx v10, v10, a3
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
+; RV32-NEXT:    vslide1down.vx v10, v10, a3
+; RV32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
 ; RV32-NEXT:    vmsne.vi v0, v10, 0
@@ -377,26 +377,26 @@ define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) {
 ;
 ; RV64-LABEL: vselect_vfpzero_v6f32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lbu a1, 0(a1)
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    slli a0, a1, 62
-; RV64-NEXT:    andi a3, a1, 1
+; RV64-NEXT:    lbu a0, 0(a1)
+; RV64-NEXT:    slli a1, a0, 62
+; RV64-NEXT:    andi a3, a0, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a3
-; RV64-NEXT:    slli a3, a1, 61
-; RV64-NEXT:    srli a0, a0, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    slli a0, a1, 60
+; RV64-NEXT:    slli a3, a0, 61
+; RV64-NEXT:    srli a1, a1, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a1
+; RV64-NEXT:    slli a1, a0, 60
 ; RV64-NEXT:    srli a3, a3, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a3
-; RV64-NEXT:    slli a3, a1, 59
-; RV64-NEXT:    srli a1, a1, 5
-; RV64-NEXT:    srli a0, a0, 63
+; RV64-NEXT:    slli a3, a0, 59
+; RV64-NEXT:    srli a0, a0, 5
+; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    srli a3, a3, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    vslide1down.vx v10, v10, a3
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
+; RV64-NEXT:    vslide1down.vx v10, v10, a3
+; RV64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
 ; RV64-NEXT:    vmsne.vi v0, v10, 0
@@ -415,8 +415,8 @@ define void @vselect_vv_v8i32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vv_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
-; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle32.v v8, (a0), v0.t
 ; CHECK-NEXT:    vse32.v v8, (a3)
 ; CHECK-NEXT:    ret
@@ -432,8 +432,8 @@ define void @vselect_vx_v8i32(i32 %a, ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vx_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    vse32.v v8, (a3)
 ; CHECK-NEXT:    ret
@@ -450,8 +450,8 @@ define void @vselect_vi_v8i32(ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vi_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
 ; CHECK-NEXT:    vse32.v v8, (a2)
 ; CHECK-NEXT:    ret
@@ -466,8 +466,8 @@ define void @vselect_vv_v8f32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vv_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
-; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle32.v v8, (a0), v0.t
 ; CHECK-NEXT:    vse32.v v8, (a3)
 ; CHECK-NEXT:    ret
@@ -483,8 +483,8 @@ define void @vselect_vx_v8f32(float %a, ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vx_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vfmerge.vfm v8, v8, fa0, v0
 ; CHECK-NEXT:    vse32.v v8, (a2)
 ; CHECK-NEXT:    ret
@@ -501,8 +501,8 @@ define void @vselect_vfpzero_v8f32(ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vfpzero_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
 ; CHECK-NEXT:    vse32.v v8, (a2)
 ; CHECK-NEXT:    ret
@@ -517,8 +517,8 @@ define void @vselect_vv_v16i16(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vv_v16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
-; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle16.v v8, (a1)
+; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
 ; CHECK-NEXT:    vse16.v v8, (a3)
 ; CHECK-NEXT:    ret
@@ -534,8 +534,8 @@ define void @vselect_vx_v16i16(i16 signext %a, ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vx_v16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle16.v v8, (a1)
+; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    vse16.v v8, (a3)
 ; CHECK-NEXT:    ret
@@ -552,8 +552,8 @@ define void @vselect_vi_v16i16(ptr %b, ptr %cc, ptr %z) {
 ; CHECK-LABEL: vselect_vi_v16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vmerge.vim v8, v8, 4, v0
 ; CHECK-NEXT:    vse16.v v8, (a2)
 ; CHECK-NEXT:    ret
@@ -569,8 +569,8 @@ define void @vselect_vv_v32f16(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a4, 32
 ; CHECK-NEXT:    vsetvli zero, a4, e16, m4, ta, mu
-; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle16.v v8, (a1)
+; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
 ; CHECK-NEXT:    vse16.v v8, (a3)
 ; CHECK-NEXT:    ret
@@ -587,8 +587,8 @@ define void @vselect_vx_v32f16(half %a, ptr %b, ptr %cc, ptr %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vfmerge.vfm v8, v8, fa0, v0
 ; CHECK-NEXT:    vse16.v v8, (a2)
 ; CHECK-NEXT:    ret
@@ -606,8 +606,8 @@ define void @vselect_vfpzero_v32f16(ptr %b, ptr %cc, ptr %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
 ; CHECK-NEXT:    vse16.v v8, (a2)
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
index 6ddf2e464750e..3e64b019643d1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
@@ -1410,9 +1410,9 @@ declare <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i
 define <32 x i64> @vssub_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vssub_vx_v32i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB108_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
index c403593894794..8ad1fc384364b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
@@ -1405,9 +1405,9 @@ declare <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i
 define <32 x i64> @vssubu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vssubu_vx_v32i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB108_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd-mask.ll
index d241b78e41391..5a343b35e8fad 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd-mask.ll
@@ -41,8 +41,8 @@ define <8 x i64> @vwaddu_vv_mask_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmslt.vx v0, v8, a0
 ; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vmslt.vx v0, v8, a0
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vwaddu.vv v12, v8, v10
 ; CHECK-NEXT:    vmv4r.v v8, v12
@@ -77,8 +77,8 @@ define <8 x i64> @vwadd_wv_mask_v8i32_nonzero(<8 x i32> %x, <8 x i64> %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmslt.vx v0, v8, a0
 ; CHECK-NEXT:    vmv.v.i v10, 1
+; CHECK-NEXT:    vmslt.vx v0, v8, a0
 ; CHECK-NEXT:    vmerge.vvm v16, v10, v8, v0
 ; CHECK-NEXT:    vwadd.wv v8, v12, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll
index 50184796b38f5..98188799fcca5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll
@@ -418,8 +418,8 @@ define <4 x i64> @vwadd_v4i64_v4i32_v4i8(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwadd_v4i64_v4i32_v4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vsext.vf4 v11, v8
 ; CHECK-NEXT:    vwadd.vv v8, v10, v11
 ; CHECK-NEXT:    ret
@@ -695,10 +695,10 @@ define <8 x i16> @vwadd_vx_v8i16_i8(ptr %x, ptr %y) {
 define <8 x i16> @vwadd_vx_v8i16_i16(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwadd_vx_v8i16_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lh a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    lh a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vwadd.wv v8, v8, v9
 ; CHECK-NEXT:    ret
@@ -750,10 +750,10 @@ define <4 x i32> @vwadd_vx_v4i32_i16(ptr %x, ptr %y) {
 define <4 x i32> @vwadd_vx_v4i32_i32(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwadd_vx_v4i32_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lw a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    lw a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vwadd.wv v8, v8, v9
 ; CHECK-NEXT:    ret
@@ -824,11 +824,11 @@ define <2 x i64> @vwadd_vx_v2i64_i64(ptr %x, ptr %y) nounwind {
 ; RV32-LABEL: vwadd_vx_v2i64_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    lw a2, 0(a1)
-; RV32-NEXT:    lw a1, 4(a1)
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vle32.v v9, (a0)
-; RV32-NEXT:    sw a2, 8(sp)
+; RV32-NEXT:    lw a0, 0(a1)
+; RV32-NEXT:    lw a1, 4(a1)
+; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vlse64.v v8, (a0), zero
@@ -838,10 +838,10 @@ define <2 x i64> @vwadd_vx_v2i64_i64(ptr %x, ptr %y) nounwind {
 ;
 ; RV64-LABEL: vwadd_vx_v2i64_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    ld a1, 0(a1)
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vle32.v v9, (a0)
-; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    ld a0, 0(a1)
+; RV64-NEXT:    vmv.v.x v8, a0
 ; RV64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64-NEXT:    vwadd.wv v8, v8, v9
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
index 98f246b8741dc..b553019568b4f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
@@ -418,8 +418,8 @@ define <4 x i64> @vwaddu_v4i64_v4i32_v4i8(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwaddu_v4i64_v4i32_v4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vzext.vf4 v11, v8
 ; CHECK-NEXT:    vwaddu.vv v8, v10, v11
 ; CHECK-NEXT:    ret
@@ -695,10 +695,10 @@ define <8 x i16> @vwaddu_vx_v8i16_i8(ptr %x, ptr %y) {
 define <8 x i16> @vwaddu_vx_v8i16_i16(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwaddu_vx_v8i16_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lh a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    lh a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vwaddu.wv v8, v8, v9
 ; CHECK-NEXT:    ret
@@ -750,10 +750,10 @@ define <4 x i32> @vwaddu_vx_v4i32_i16(ptr %x, ptr %y) {
 define <4 x i32> @vwaddu_vx_v4i32_i32(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwaddu_vx_v4i32_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lw a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    lw a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vwaddu.wv v8, v8, v9
 ; CHECK-NEXT:    ret
@@ -866,11 +866,11 @@ define <2 x i64> @vwaddu_vx_v2i64_i64(ptr %x, ptr %y) nounwind {
 ; RV32-LABEL: vwaddu_vx_v2i64_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    lw a2, 0(a1)
-; RV32-NEXT:    lw a1, 4(a1)
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vle32.v v9, (a0)
-; RV32-NEXT:    sw a2, 8(sp)
+; RV32-NEXT:    lw a0, 0(a1)
+; RV32-NEXT:    lw a1, 4(a1)
+; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vlse64.v v8, (a0), zero
@@ -880,10 +880,10 @@ define <2 x i64> @vwaddu_vx_v2i64_i64(ptr %x, ptr %y) nounwind {
 ;
 ; RV64-LABEL: vwaddu_vx_v2i64_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    ld a1, 0(a1)
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vle32.v v9, (a0)
-; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    ld a0, 0(a1)
+; RV64-NEXT:    vmv.v.x v8, a0
 ; RV64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64-NEXT:    vwaddu.wv v8, v8, v9
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
index eb7be14abe431..115113045548b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
@@ -454,8 +454,8 @@ define <4 x i64> @vwmul_v4i64_v4i32_v4i8(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwmul_v4i64_v4i32_v4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vsext.vf4 v11, v8
 ; CHECK-NEXT:    vwmul.vv v8, v10, v11
 ; CHECK-NEXT:    ret
@@ -859,11 +859,11 @@ define <2 x i64> @vwmul_vx_v2i64_i64(ptr %x, ptr %y) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lw a2, 0(a1)
-; RV32-NEXT:    lw a1, 4(a1)
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    sw a2, 8(sp)
+; RV32-NEXT:    lw a0, 0(a1)
+; RV32-NEXT:    lw a1, 4(a1)
+; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
index 8626b25a9d323..ce84e9fa0cbfd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
@@ -446,8 +446,8 @@ define <4 x i64> @vwmulsu_v4i64_v4i32_v4i8(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwmulsu_v4i64_v4i32_v4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vsext.vf4 v11, v8
 ; CHECK-NEXT:    vwmulsu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
@@ -740,10 +740,10 @@ define <8 x i16> @vwmulsu_vx_v8i16_i8(ptr %x, ptr %y) {
 define <8 x i16> @vwmulsu_vx_v8i16_i8_swap(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwmulsu_vx_v8i16_i8_swap:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lb a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    lb a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vwmulsu.vv v8, v10, v9
 ; CHECK-NEXT:    ret
   %a = load <8 x i8>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
index 007b561a2247a..9adaefd37abab 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
@@ -430,8 +430,8 @@ define <4 x i64> @vwmulu_v4i64_v4i32_v4i8(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwmulu_v4i64_v4i32_v4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vzext.vf4 v11, v8
 ; CHECK-NEXT:    vwmulu.vv v8, v10, v11
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub-mask.ll
index 382f00913cb41..36af235446425 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub-mask.ll
@@ -41,8 +41,8 @@ define <8 x i64> @vwsubu_vv_mask_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmslt.vx v0, v8, a0
 ; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vmslt.vx v0, v8, a0
 ; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    vwsubu.vv v12, v10, v8
 ; CHECK-NEXT:    vmv4r.v v8, v12
@@ -60,8 +60,8 @@ define <8 x i64> @vwsub_wv_mask_v8i32_nonzero(<8 x i32> %x, <8 x i64> %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmslt.vx v0, v8, a0
 ; CHECK-NEXT:    vmv.v.i v10, 1
+; CHECK-NEXT:    vmslt.vx v0, v8, a0
 ; CHECK-NEXT:    vmerge.vvm v16, v10, v8, v0
 ; CHECK-NEXT:    vwsub.wv v8, v12, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll
index 7a925165d9816..5d3e39f96d567 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll
@@ -418,8 +418,8 @@ define <4 x i64> @vwsub_v4i64_v4i32_v4i8(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwsub_v4i64_v4i32_v4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vsext.vf4 v11, v8
 ; CHECK-NEXT:    vwsub.vv v8, v10, v11
 ; CHECK-NEXT:    ret
@@ -677,10 +677,10 @@ define <16 x i64> @vwsub_vx_v16i64(ptr %x, i32 %y) {
 define <8 x i16> @vwsub_vx_v8i16_i8(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwsub_vx_v8i16_i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lb a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    lb a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vwsub.vv v8, v10, v9
 ; CHECK-NEXT:    ret
   %a = load <8 x i8>, ptr %x
@@ -696,10 +696,10 @@ define <8 x i16> @vwsub_vx_v8i16_i8(ptr %x, ptr %y) {
 define <8 x i16> @vwsub_vx_v8i16_i16(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwsub_vx_v8i16_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lh a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    lh a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vwsub.wv v8, v8, v9
 ; CHECK-NEXT:    ret
@@ -715,10 +715,10 @@ define <8 x i16> @vwsub_vx_v8i16_i16(ptr %x, ptr %y) {
 define <4 x i32> @vwsub_vx_v4i32_i8(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwsub_vx_v4i32_i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lb a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    lb a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vwsub.vv v8, v10, v9
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
@@ -734,10 +734,10 @@ define <4 x i32> @vwsub_vx_v4i32_i8(ptr %x, ptr %y) {
 define <4 x i32> @vwsub_vx_v4i32_i16(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwsub_vx_v4i32_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lh a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    lh a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vwsub.vv v8, v10, v9
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
@@ -753,10 +753,10 @@ define <4 x i32> @vwsub_vx_v4i32_i16(ptr %x, ptr %y) {
 define <4 x i32> @vwsub_vx_v4i32_i32(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwsub_vx_v4i32_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lw a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    lw a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vwsub.wv v8, v8, v9
 ; CHECK-NEXT:    ret
@@ -772,10 +772,10 @@ define <4 x i32> @vwsub_vx_v4i32_i32(ptr %x, ptr %y) {
 define <2 x i64> @vwsub_vx_v2i64_i8(ptr %x, ptr %y) nounwind {
 ; CHECK-LABEL: vwsub_vx_v2i64_i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lb a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    lb a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vwsub.vv v8, v10, v9
 ; CHECK-NEXT:    ret
   %a = load <2 x i32>, ptr %x
@@ -791,10 +791,10 @@ define <2 x i64> @vwsub_vx_v2i64_i8(ptr %x, ptr %y) nounwind {
 define <2 x i64> @vwsub_vx_v2i64_i16(ptr %x, ptr %y) nounwind {
 ; CHECK-LABEL: vwsub_vx_v2i64_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lh a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    lh a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vwsub.vv v8, v10, v9
 ; CHECK-NEXT:    ret
   %a = load <2 x i32>, ptr %x
@@ -810,10 +810,10 @@ define <2 x i64> @vwsub_vx_v2i64_i16(ptr %x, ptr %y) nounwind {
 define <2 x i64> @vwsub_vx_v2i64_i32(ptr %x, ptr %y) nounwind {
 ; CHECK-LABEL: vwsub_vx_v2i64_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lw a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    lw a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vwsub.vv v8, v10, v9
 ; CHECK-NEXT:    ret
   %a = load <2 x i32>, ptr %x
@@ -830,11 +830,11 @@ define <2 x i64> @vwsub_vx_v2i64_i64(ptr %x, ptr %y) nounwind {
 ; RV32-LABEL: vwsub_vx_v2i64_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    lw a2, 0(a1)
-; RV32-NEXT:    lw a1, 4(a1)
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vle32.v v9, (a0)
-; RV32-NEXT:    sw a2, 8(sp)
+; RV32-NEXT:    lw a0, 0(a1)
+; RV32-NEXT:    lw a1, 4(a1)
+; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vlse64.v v8, (a0), zero
@@ -844,10 +844,10 @@ define <2 x i64> @vwsub_vx_v2i64_i64(ptr %x, ptr %y) nounwind {
 ;
 ; RV64-LABEL: vwsub_vx_v2i64_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    ld a1, 0(a1)
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vle32.v v9, (a0)
-; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    ld a0, 0(a1)
+; RV64-NEXT:    vmv.v.x v8, a0
 ; RV64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64-NEXT:    vwsub.wv v8, v8, v9
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
index 4c08a8c15a388..bbe1ba03bdb6d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
@@ -418,8 +418,8 @@ define <4 x i64> @vwsubu_v4i64_v4i32_v4i8(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwsubu_v4i64_v4i32_v4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vzext.vf4 v11, v8
 ; CHECK-NEXT:    vwsubu.vv v8, v10, v11
 ; CHECK-NEXT:    ret
@@ -677,10 +677,10 @@ define <16 x i64> @vwsubu_vx_v16i64(ptr %x, i32 %y) {
 define <8 x i16> @vwsubu_vx_v8i16_i8(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwsubu_vx_v8i16_i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lbu a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    lbu a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vwsubu.vv v8, v10, v9
 ; CHECK-NEXT:    ret
   %a = load <8 x i8>, ptr %x
@@ -696,10 +696,10 @@ define <8 x i16> @vwsubu_vx_v8i16_i8(ptr %x, ptr %y) {
 define <8 x i16> @vwsubu_vx_v8i16_i16(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwsubu_vx_v8i16_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lh a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    lh a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vwsubu.wv v8, v8, v9
 ; CHECK-NEXT:    ret
@@ -715,10 +715,10 @@ define <8 x i16> @vwsubu_vx_v8i16_i16(ptr %x, ptr %y) {
 define <4 x i32> @vwsubu_vx_v4i32_i8(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwsubu_vx_v4i32_i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lbu a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    lbu a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vwsubu.vv v8, v10, v9
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
@@ -734,10 +734,10 @@ define <4 x i32> @vwsubu_vx_v4i32_i8(ptr %x, ptr %y) {
 define <4 x i32> @vwsubu_vx_v4i32_i16(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwsubu_vx_v4i32_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lhu a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    lhu a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vwsubu.vv v8, v10, v9
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
@@ -753,10 +753,10 @@ define <4 x i32> @vwsubu_vx_v4i32_i16(ptr %x, ptr %y) {
 define <4 x i32> @vwsubu_vx_v4i32_i32(ptr %x, ptr %y) {
 ; CHECK-LABEL: vwsubu_vx_v4i32_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lw a1, 0(a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    lw a0, 0(a1)
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vwsubu.wv v8, v8, v9
 ; CHECK-NEXT:    ret
@@ -786,10 +786,10 @@ define <2 x i64> @vwsubu_vx_v2i64_i8(ptr %x, ptr %y) nounwind {
 ;
 ; RV64-LABEL: vwsubu_vx_v2i64_i8:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lbu a1, 0(a1)
 ; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-NEXT:    vle32.v v9, (a0)
-; RV64-NEXT:    vmv.v.x v10, a1
+; RV64-NEXT:    lbu a0, 0(a1)
+; RV64-NEXT:    vmv.v.x v10, a0
 ; RV64-NEXT:    vwsubu.vv v8, v10, v9
 ; RV64-NEXT:    ret
   %a = load <2 x i32>, ptr %x
@@ -819,10 +819,10 @@ define <2 x i64> @vwsubu_vx_v2i64_i16(ptr %x, ptr %y) nounwind {
 ;
 ; RV64-LABEL: vwsubu_vx_v2i64_i16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lhu a1, 0(a1)
 ; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-NEXT:    vle32.v v9, (a0)
-; RV64-NEXT:    vmv.v.x v10, a1
+; RV64-NEXT:    lhu a0, 0(a1)
+; RV64-NEXT:    vmv.v.x v10, a0
 ; RV64-NEXT:    vwsubu.vv v8, v10, v9
 ; RV64-NEXT:    ret
   %a = load <2 x i32>, ptr %x
@@ -852,10 +852,10 @@ define <2 x i64> @vwsubu_vx_v2i64_i32(ptr %x, ptr %y) nounwind {
 ;
 ; RV64-LABEL: vwsubu_vx_v2i64_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lwu a1, 0(a1)
 ; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-NEXT:    vle32.v v9, (a0)
-; RV64-NEXT:    vmv.v.x v10, a1
+; RV64-NEXT:    lwu a0, 0(a1)
+; RV64-NEXT:    vmv.v.x v10, a0
 ; RV64-NEXT:    vwsubu.vv v8, v10, v9
 ; RV64-NEXT:    ret
   %a = load <2 x i32>, ptr %x
@@ -872,11 +872,11 @@ define <2 x i64> @vwsubu_vx_v2i64_i64(ptr %x, ptr %y) nounwind {
 ; RV32-LABEL: vwsubu_vx_v2i64_i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    lw a2, 0(a1)
-; RV32-NEXT:    lw a1, 4(a1)
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vle32.v v9, (a0)
-; RV32-NEXT:    sw a2, 8(sp)
+; RV32-NEXT:    lw a0, 0(a1)
+; RV32-NEXT:    lw a1, 4(a1)
+; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vlse64.v v8, (a0), zero
@@ -886,10 +886,10 @@ define <2 x i64> @vwsubu_vx_v2i64_i64(ptr %x, ptr %y) nounwind {
 ;
 ; RV64-LABEL: vwsubu_vx_v2i64_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    ld a1, 0(a1)
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vle32.v v9, (a0)
-; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    ld a0, 0(a1)
+; RV64-NEXT:    vmv.v.x v8, a0
 ; RV64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64-NEXT:    vwsubu.wv v8, v8, v9
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
index df90dae379c06..b38701ebd3448 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
@@ -151,9 +151,9 @@ declare <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32>, <32 x i1>, i32)
 define <32 x i64> @vzext_v32i64_v32i32(<32 x i32> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vzext_v32i64_v32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v16, v0, 2
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB12_2
 ; CHECK-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll
index b7661bd826fed..ad973b72b271f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll
@@ -405,8 +405,8 @@ define <vscale x 1 x i8> @ceil_nxv1f32_to_si8(<vscale x 1 x float> %x) {
 ; RV32-NEXT:    vfabs.v v9, v8
 ; RV32-NEXT:    lui a0, 307200
 ; RV32-NEXT:    fmv.w.x fa5, a0
-; RV32-NEXT:    vmflt.vf v0, v9, fa5
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vmflt.vf v0, v9, fa5
 ; RV32-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -424,8 +424,8 @@ define <vscale x 1 x i8> @ceil_nxv1f32_to_si8(<vscale x 1 x float> %x) {
 ; RV64-NEXT:    vfabs.v v9, v8
 ; RV64-NEXT:    lui a0, 307200
 ; RV64-NEXT:    fmv.w.x fa5, a0
-; RV64-NEXT:    vmflt.vf v0, v9, fa5
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vmflt.vf v0, v9, fa5
 ; RV64-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -448,8 +448,8 @@ define <vscale x 1 x i8> @ceil_nxv1f32_to_ui8(<vscale x 1 x float> %x) {
 ; RV32-NEXT:    vfabs.v v9, v8
 ; RV32-NEXT:    lui a0, 307200
 ; RV32-NEXT:    fmv.w.x fa5, a0
-; RV32-NEXT:    vmflt.vf v0, v9, fa5
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vmflt.vf v0, v9, fa5
 ; RV32-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -467,8 +467,8 @@ define <vscale x 1 x i8> @ceil_nxv1f32_to_ui8(<vscale x 1 x float> %x) {
 ; RV64-NEXT:    vfabs.v v9, v8
 ; RV64-NEXT:    lui a0, 307200
 ; RV64-NEXT:    fmv.w.x fa5, a0
-; RV64-NEXT:    vmflt.vf v0, v9, fa5
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vmflt.vf v0, v9, fa5
 ; RV64-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -631,8 +631,8 @@ define <vscale x 4 x i8> @ceil_nxv4f32_to_si8(<vscale x 4 x float> %x) {
 ; RV32-NEXT:    vfabs.v v10, v8
 ; RV32-NEXT:    lui a0, 307200
 ; RV32-NEXT:    fmv.w.x fa5, a0
-; RV32-NEXT:    vmflt.vf v0, v10, fa5
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vmflt.vf v0, v10, fa5
 ; RV32-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -650,8 +650,8 @@ define <vscale x 4 x i8> @ceil_nxv4f32_to_si8(<vscale x 4 x float> %x) {
 ; RV64-NEXT:    vfabs.v v10, v8
 ; RV64-NEXT:    lui a0, 307200
 ; RV64-NEXT:    fmv.w.x fa5, a0
-; RV64-NEXT:    vmflt.vf v0, v10, fa5
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vmflt.vf v0, v10, fa5
 ; RV64-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -674,8 +674,8 @@ define <vscale x 4 x i8> @ceil_nxv4f32_to_ui8(<vscale x 4 x float> %x) {
 ; RV32-NEXT:    vfabs.v v10, v8
 ; RV32-NEXT:    lui a0, 307200
 ; RV32-NEXT:    fmv.w.x fa5, a0
-; RV32-NEXT:    vmflt.vf v0, v10, fa5
 ; RV32-NEXT:    fsrmi a0, 3
+; RV32-NEXT:    vmflt.vf v0, v10, fa5
 ; RV32-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; RV32-NEXT:    fsrm a0
 ; RV32-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -693,8 +693,8 @@ define <vscale x 4 x i8> @ceil_nxv4f32_to_ui8(<vscale x 4 x float> %x) {
 ; RV64-NEXT:    vfabs.v v10, v8
 ; RV64-NEXT:    lui a0, 307200
 ; RV64-NEXT:    fmv.w.x fa5, a0
-; RV64-NEXT:    vmflt.vf v0, v10, fa5
 ; RV64-NEXT:    fsrmi a0, 3
+; RV64-NEXT:    vmflt.vf v0, v10, fa5
 ; RV64-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; RV64-NEXT:    fsrm a0
 ; RV64-NEXT:    vfcvt.f.x.v v10, v10, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
index f9b5095c9af1d..c8b5487b3aee6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
@@ -22,12 +22,12 @@ define <vscale x 1 x bfloat> @vp_floor_nxv1bf16(<vscale x 1 x bfloat> %va, <vsca
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v11, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -49,11 +49,11 @@ define <vscale x 1 x bfloat> @vp_floor_nxv1bf16_unmasked(<vscale x 1 x bfloat> %
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -76,12 +76,12 @@ define <vscale x 2 x bfloat> @vp_floor_nxv2bf16(<vscale x 2 x bfloat> %va, <vsca
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v11, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv.v.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -103,11 +103,11 @@ define <vscale x 2 x bfloat> @vp_floor_nxv2bf16_unmasked(<vscale x 2 x bfloat> %
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -130,12 +130,12 @@ define <vscale x 4 x bfloat> @vp_floor_nxv4bf16(<vscale x 4 x bfloat> %va, <vsca
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v10, v0.t
@@ -157,11 +157,11 @@ define <vscale x 4 x bfloat> @vp_floor_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -184,12 +184,12 @@ define <vscale x 8 x bfloat> @vp_floor_nxv8bf16(<vscale x 8 x bfloat> %va, <vsca
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v12, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v12, v0.t
@@ -211,11 +211,11 @@ define <vscale x 8 x bfloat> @vp_floor_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -238,12 +238,12 @@ define <vscale x 16 x bfloat> @vp_floor_nxv16bf16(<vscale x 16 x bfloat> %va, <v
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
@@ -265,11 +265,11 @@ define <vscale x 16 x bfloat> @vp_floor_nxv16bf16_unmasked(<vscale x 16 x bfloat
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -297,6 +297,7 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    lui a3, 307200
+; CHECK-NEXT:    fsrmi a4, 2
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
@@ -315,11 +316,10 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    vfabs.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v18, v8, fa5, v0.t
-; CHECK-NEXT:    fsrmi a2, 2
 ; CHECK-NEXT:    vmv1r.v v0, v18
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a4
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -336,11 +336,11 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v7
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
@@ -375,11 +375,12 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmset.m v16
 ; CHECK-NEXT:    lui a3, 307200
+; CHECK-NEXT:    fsrmi a4, 2
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v16, v16, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    vmv1r.v v17, v16
@@ -394,11 +395,10 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    vfabs.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; CHECK-NEXT:    fsrmi a2, 2
 ; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a4
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -413,10 +413,10 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -439,13 +439,13 @@ declare <vscale x 1 x half> @llvm.vp.floor.nxv1f16(<vscale x 1 x half>, <vscale
 define <vscale x 1 x half> @vp_floor_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_nxv1f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI12_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI12_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI12_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI12_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -461,12 +461,12 @@ define <vscale x 1 x half> @vp_floor_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -485,12 +485,12 @@ define <vscale x 1 x half> @vp_floor_nxv1f16(<vscale x 1 x half> %va, <vscale x
 define <vscale x 1 x half> @vp_floor_nxv1f16_unmasked(<vscale x 1 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_nxv1f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI13_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI13_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI13_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI13_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -503,11 +503,11 @@ define <vscale x 1 x half> @vp_floor_nxv1f16_unmasked(<vscale x 1 x half> %va, i
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -525,13 +525,13 @@ declare <vscale x 2 x half> @llvm.vp.floor.nxv2f16(<vscale x 2 x half>, <vscale
 define <vscale x 2 x half> @vp_floor_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_nxv2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI14_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI14_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI14_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI14_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -547,12 +547,12 @@ define <vscale x 2 x half> @vp_floor_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -571,12 +571,12 @@ define <vscale x 2 x half> @vp_floor_nxv2f16(<vscale x 2 x half> %va, <vscale x
 define <vscale x 2 x half> @vp_floor_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_nxv2f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI15_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI15_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI15_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI15_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -589,11 +589,11 @@ define <vscale x 2 x half> @vp_floor_nxv2f16_unmasked(<vscale x 2 x half> %va, i
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -611,13 +611,13 @@ declare <vscale x 4 x half> @llvm.vp.floor.nxv4f16(<vscale x 4 x half>, <vscale
 define <vscale x 4 x half> @vp_floor_nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_nxv4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI16_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI16_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI16_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI16_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -633,12 +633,12 @@ define <vscale x 4 x half> @vp_floor_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
@@ -657,12 +657,12 @@ define <vscale x 4 x half> @vp_floor_nxv4f16(<vscale x 4 x half> %va, <vscale x
 define <vscale x 4 x half> @vp_floor_nxv4f16_unmasked(<vscale x 4 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_nxv4f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI17_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI17_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI17_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI17_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -675,11 +675,11 @@ define <vscale x 4 x half> @vp_floor_nxv4f16_unmasked(<vscale x 4 x half> %va, i
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -699,12 +699,12 @@ define <vscale x 8 x half> @vp_floor_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v10, v0
+; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI18_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI18_0)(a0)
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -721,12 +721,12 @@ define <vscale x 8 x half> @vp_floor_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
@@ -745,12 +745,12 @@ define <vscale x 8 x half> @vp_floor_nxv8f16(<vscale x 8 x half> %va, <vscale x
 define <vscale x 8 x half> @vp_floor_nxv8f16_unmasked(<vscale x 8 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_nxv8f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI19_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI19_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI19_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI19_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -763,11 +763,11 @@ define <vscale x 8 x half> @vp_floor_nxv8f16_unmasked(<vscale x 8 x half> %va, i
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -787,12 +787,12 @@ define <vscale x 16 x half> @vp_floor_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v12, v0
+; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI20_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI20_0)(a0)
-; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -809,12 +809,12 @@ define <vscale x 16 x half> @vp_floor_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v24, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
@@ -833,12 +833,12 @@ define <vscale x 16 x half> @vp_floor_nxv16f16(<vscale x 16 x half> %va, <vscale
 define <vscale x 16 x half> @vp_floor_nxv16f16_unmasked(<vscale x 16 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_nxv16f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI21_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI21_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8
-; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI21_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI21_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -851,11 +851,11 @@ define <vscale x 16 x half> @vp_floor_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -875,12 +875,12 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v16, v0
+; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI22_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI22_0)(a0)
-; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 2
 ; ZVFH-NEXT:    vmv1r.v v0, v16
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -902,6 +902,7 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    lui a3, 307200
+; ZVFHMIN-NEXT:    fsrmi a4, 2
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
@@ -920,11 +921,10 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v18, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a2, 2
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v18
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    fsrm a2
+; ZVFHMIN-NEXT:    fsrm a4
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -941,11 +941,11 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v7
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
@@ -970,12 +970,12 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
 define <vscale x 32 x half> @vp_floor_nxv32f16_unmasked(<vscale x 32 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_nxv32f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI23_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI23_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8
-; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI23_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI23_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 2
+; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -995,11 +995,12 @@ define <vscale x 32 x half> @vp_floor_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    lui a3, 307200
+; ZVFHMIN-NEXT:    fsrmi a4, 2
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v17, v16
@@ -1014,11 +1015,10 @@ define <vscale x 32 x half> @vp_floor_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a2, 2
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v17
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    fsrm a2
+; ZVFHMIN-NEXT:    fsrm a4
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -1033,10 +1033,10 @@ define <vscale x 32 x half> @vp_floor_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -1064,9 +1064,9 @@ define <vscale x 1 x float> @vp_floor_nxv1f32(<vscale x 1 x float> %va, <vscale
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1085,8 +1085,8 @@ define <vscale x 1 x float> @vp_floor_nxv1f32_unmasked(<vscale x 1 x float> %va,
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -1106,9 +1106,9 @@ define <vscale x 2 x float> @vp_floor_nxv2f32(<vscale x 2 x float> %va, <vscale
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1127,8 +1127,8 @@ define <vscale x 2 x float> @vp_floor_nxv2f32_unmasked(<vscale x 2 x float> %va,
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -1149,9 +1149,9 @@ define <vscale x 4 x float> @vp_floor_nxv4f32(<vscale x 4 x float> %va, <vscale
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -1171,8 +1171,8 @@ define <vscale x 4 x float> @vp_floor_nxv4f32_unmasked(<vscale x 4 x float> %va,
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -1193,9 +1193,9 @@ define <vscale x 8 x float> @vp_floor_nxv8f32(<vscale x 8 x float> %va, <vscale
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -1215,8 +1215,8 @@ define <vscale x 8 x float> @vp_floor_nxv8f32_unmasked(<vscale x 8 x float> %va,
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -1237,9 +1237,9 @@ define <vscale x 16 x float> @vp_floor_nxv16f32(<vscale x 16 x float> %va, <vsca
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -1259,8 +1259,8 @@ define <vscale x 16 x float> @vp_floor_nxv16f32_unmasked(<vscale x 16 x float> %
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -1276,13 +1276,13 @@ declare <vscale x 1 x double> @llvm.vp.floor.nxv1f64(<vscale x 1 x double>, <vsc
 define <vscale x 1 x double> @vp_floor_nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI34_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI34_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
+; CHECK-NEXT:    lui a0, %hi(.LCPI34_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI34_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1297,12 +1297,12 @@ define <vscale x 1 x double> @vp_floor_nxv1f64(<vscale x 1 x double> %va, <vscal
 define <vscale x 1 x double> @vp_floor_nxv1f64_unmasked(<vscale x 1 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv1f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI35_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI35_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI35_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI35_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -1320,12 +1320,12 @@ define <vscale x 2 x double> @vp_floor_nxv2f64(<vscale x 2 x double> %va, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI36_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI36_0)(a0)
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -1341,12 +1341,12 @@ define <vscale x 2 x double> @vp_floor_nxv2f64(<vscale x 2 x double> %va, <vscal
 define <vscale x 2 x double> @vp_floor_nxv2f64_unmasked(<vscale x 2 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv2f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI37_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI37_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI37_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI37_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -1364,12 +1364,12 @@ define <vscale x 4 x double> @vp_floor_nxv4f64(<vscale x 4 x double> %va, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI38_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI38_0)(a0)
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -1385,12 +1385,12 @@ define <vscale x 4 x double> @vp_floor_nxv4f64(<vscale x 4 x double> %va, <vscal
 define <vscale x 4 x double> @vp_floor_nxv4f64_unmasked(<vscale x 4 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv4f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI39_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI39_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI39_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI39_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -1408,12 +1408,12 @@ define <vscale x 7 x double> @vp_floor_nxv7f64(<vscale x 7 x double> %va, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI40_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI40_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -1429,12 +1429,12 @@ define <vscale x 7 x double> @vp_floor_nxv7f64(<vscale x 7 x double> %va, <vscal
 define <vscale x 7 x double> @vp_floor_nxv7f64_unmasked(<vscale x 7 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv7f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI41_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI41_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI41_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI41_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -1452,12 +1452,12 @@ define <vscale x 8 x double> @vp_floor_nxv8f64(<vscale x 8 x double> %va, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI42_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI42_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -1473,12 +1473,12 @@ define <vscale x 8 x double> @vp_floor_nxv8f64(<vscale x 8 x double> %va, <vscal
 define <vscale x 8 x double> @vp_floor_nxv8f64_unmasked(<vscale x 8 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv8f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI43_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI43_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI43_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI43_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -1498,59 +1498,66 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
 ; CHECK-NEXT:    srli a3, a1, 3
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    vslidedown.vx v6, v0, a3
+; CHECK-NEXT:    vslidedown.vx v25, v0, a3
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    fsrmi a3, 2
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a2, 2
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmflt.vf v25, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    fsrm a3
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -1570,12 +1577,12 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    fsrmi a3, 2
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a2, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a3
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
@@ -1585,8 +1592,8 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:  .LBB45_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
index 7fad68dbfbbda..42903f0d85e32 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
@@ -22,16 +22,14 @@ define <vscale x 1 x bfloat> @vfmax_nxv1bf16_vv(<vscale x 1 x bfloat> %a, <vscal
 ; CHECK-LABEL: vfmax_nxv1bf16_vv:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
+; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vmerge.vvm v9, v10, v8, v0
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmerge.vvm v8, v8, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v8, v9
+; CHECK-NEXT:    vmerge.vvm v9, v10, v9, v0
+; CHECK-NEXT:    vfmax.vv v9, v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
 ; CHECK-NEXT:    ret
@@ -45,16 +43,14 @@ define <vscale x 2 x bfloat> @vfmax_nxv2bf16_vv(<vscale x 2 x bfloat> %a, <vscal
 ; CHECK-LABEL: vfmax_nxv2bf16_vv:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
+; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmerge.vvm v9, v10, v8, v0
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmerge.vvm v8, v8, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v8, v9
+; CHECK-NEXT:    vmerge.vvm v9, v10, v9, v0
+; CHECK-NEXT:    vfmax.vv v9, v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
 ; CHECK-NEXT:    ret
@@ -68,15 +64,13 @@ define <vscale x 4 x bfloat> @vfmax_nxv4bf16_vv(<vscale x 4 x bfloat> %a, <vscal
 ; CHECK-LABEL: vfmax_nxv4bf16_vv:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v12, v10, v0
+; CHECK-NEXT:    vmerge.vvm v8, v12, v10, v0
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
+; CHECK-NEXT:    vmerge.vvm v10, v10, v12, v0
 ; CHECK-NEXT:    vfmax.vv v10, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
@@ -91,15 +85,13 @@ define <vscale x 8 x bfloat> @vfmax_nxv8bf16_vv(<vscale x 8 x bfloat> %a, <vscal
 ; CHECK-LABEL: vfmax_nxv8bf16_vv:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v16, v12, v0
+; CHECK-NEXT:    vmerge.vvm v8, v16, v12, v0
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
+; CHECK-NEXT:    vmerge.vvm v12, v12, v16, v0
 ; CHECK-NEXT:    vfmax.vv v12, v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
@@ -317,16 +309,14 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv(<vscale x 1 x half> %a, <vscale x 1
 ; ZVFHMIN-LABEL: vfmax_nxv1f16_vv:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
-; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -350,16 +340,14 @@ define <vscale x 2 x half> @vfmax_nxv2f16_vv(<vscale x 2 x half> %a, <vscale x 2
 ; ZVFHMIN-LABEL: vfmax_nxv2f16_vv:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
-; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -383,15 +371,13 @@ define <vscale x 4 x half> @vfmax_nxv4f16_vv(<vscale x 4 x half> %a, <vscale x 4
 ; ZVFHMIN-LABEL: vfmax_nxv4f16_vv:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v10, v12, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -416,15 +402,13 @@ define <vscale x 8 x half> @vfmax_nxv8f16_vv(<vscale x 8 x half> %a, <vscale x 8
 ; ZVFHMIN-LABEL: vfmax_nxv8f16_vv:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v12, v12, v16, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -754,10 +738,12 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv_nnana(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v10
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v9
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
@@ -786,10 +772,12 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv_nnanb(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v9
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index d56e46f7db3ab..06bc9d6e855a2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -42,16 +42,14 @@ define <vscale x 1 x bfloat> @vfmax_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %
 ; CHECK-LABEL: vfmax_vv_nxv1bf16_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
+; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vmerge.vvm v9, v10, v8, v0
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmerge.vvm v8, v8, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v8, v9
+; CHECK-NEXT:    vmerge.vvm v9, v10, v9, v0
+; CHECK-NEXT:    vfmax.vv v9, v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
 ; CHECK-NEXT:    ret
@@ -87,16 +85,14 @@ define <vscale x 2 x bfloat> @vfmax_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %
 ; CHECK-LABEL: vfmax_vv_nxv2bf16_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
+; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmerge.vvm v9, v10, v8, v0
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmerge.vvm v8, v8, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v8, v9
+; CHECK-NEXT:    vmerge.vvm v9, v10, v9, v0
+; CHECK-NEXT:    vfmax.vv v9, v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
 ; CHECK-NEXT:    ret
@@ -134,15 +130,13 @@ define <vscale x 4 x bfloat> @vfmax_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
 ; CHECK-LABEL: vfmax_vv_nxv4bf16_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v12, v10, v0
+; CHECK-NEXT:    vmerge.vvm v8, v12, v10, v0
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
+; CHECK-NEXT:    vmerge.vvm v10, v10, v12, v0
 ; CHECK-NEXT:    vfmax.vv v10, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
@@ -181,15 +175,13 @@ define <vscale x 8 x bfloat> @vfmax_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
 ; CHECK-LABEL: vfmax_vv_nxv8bf16_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v16, v12, v0
+; CHECK-NEXT:    vmerge.vvm v8, v16, v12, v0
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
+; CHECK-NEXT:    vmerge.vvm v12, v12, v16, v0
 ; CHECK-NEXT:    vfmax.vv v12, v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
@@ -633,16 +625,14 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ; ZVFHMIN-LABEL: vfmax_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
-; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -701,16 +691,14 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ; ZVFHMIN-LABEL: vfmax_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
-; ZVFHMIN-NEXT:    vfmax.vv v9, v8, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -771,15 +759,13 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ; ZVFHMIN-LABEL: vfmax_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v10, v12, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -843,15 +829,13 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ; ZVFHMIN-LABEL: vfmax_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v12, v12, v16, v0
 ; ZVFHMIN-NEXT:    vfmax.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1615,8 +1599,6 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
@@ -1644,23 +1626,21 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
-; CHECK-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-NEXT:    vmv1r.v v0, v26
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-NEXT:    bltu a2, a1, .LBB40_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
@@ -1707,14 +1687,10 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
@@ -1726,45 +1702,36 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vl8re64.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT:    vfmax.vv v8, v16, v8
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfmax.vv v16, v16, v8
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v24, (a0)
 ; CHECK-NEXT:    bltu a2, a1, .LBB41_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB41_2:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
+; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v8, v8
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v24
-; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
index 8cae0bbc03c8e..3dc02bb4a5a11 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
@@ -22,16 +22,14 @@ define <vscale x 1 x bfloat> @vfmin_nxv1bf16_vv(<vscale x 1 x bfloat> %a, <vscal
 ; CHECK-LABEL: vfmin_nxv1bf16_vv:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
+; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vmerge.vvm v9, v10, v8, v0
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmerge.vvm v8, v8, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v8, v9
+; CHECK-NEXT:    vmerge.vvm v9, v10, v9, v0
+; CHECK-NEXT:    vfmin.vv v9, v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
 ; CHECK-NEXT:    ret
@@ -45,16 +43,14 @@ define <vscale x 2 x bfloat> @vfmin_nxv2bf16_vv(<vscale x 2 x bfloat> %a, <vscal
 ; CHECK-LABEL: vfmin_nxv2bf16_vv:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
+; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmerge.vvm v9, v10, v8, v0
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmerge.vvm v8, v8, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v8, v9
+; CHECK-NEXT:    vmerge.vvm v9, v10, v9, v0
+; CHECK-NEXT:    vfmin.vv v9, v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
 ; CHECK-NEXT:    ret
@@ -68,15 +64,13 @@ define <vscale x 4 x bfloat> @vfmin_nxv4bf16_vv(<vscale x 4 x bfloat> %a, <vscal
 ; CHECK-LABEL: vfmin_nxv4bf16_vv:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v12, v10, v0
+; CHECK-NEXT:    vmerge.vvm v8, v12, v10, v0
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
+; CHECK-NEXT:    vmerge.vvm v10, v10, v12, v0
 ; CHECK-NEXT:    vfmin.vv v10, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
@@ -91,15 +85,13 @@ define <vscale x 8 x bfloat> @vfmin_nxv8bf16_vv(<vscale x 8 x bfloat> %a, <vscal
 ; CHECK-LABEL: vfmin_nxv8bf16_vv:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v16, v12, v0
+; CHECK-NEXT:    vmerge.vvm v8, v16, v12, v0
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
+; CHECK-NEXT:    vmerge.vvm v12, v12, v16, v0
 ; CHECK-NEXT:    vfmin.vv v12, v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
@@ -317,16 +309,14 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv(<vscale x 1 x half> %a, <vscale x 1
 ; ZVFHMIN-LABEL: vfmin_nxv1f16_vv:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
-; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -350,16 +340,14 @@ define <vscale x 2 x half> @vfmin_nxv2f16_vv(<vscale x 2 x half> %a, <vscale x 2
 ; ZVFHMIN-LABEL: vfmin_nxv2f16_vv:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
-; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -383,15 +371,13 @@ define <vscale x 4 x half> @vfmin_nxv4f16_vv(<vscale x 4 x half> %a, <vscale x 4
 ; ZVFHMIN-LABEL: vfmin_nxv4f16_vv:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v10, v12, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -416,15 +402,13 @@ define <vscale x 8 x half> @vfmin_nxv8f16_vv(<vscale x 8 x half> %a, <vscale x 8
 ; ZVFHMIN-LABEL: vfmin_nxv8f16_vv:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v12, v12, v16, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -754,10 +738,12 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv_nnana(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v10
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v9
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
@@ -786,10 +772,12 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv_nnanb(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v9
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index 81e4a548f560e..7c1cc1a76e73a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -42,16 +42,14 @@ define <vscale x 1 x bfloat> @vfmin_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %
 ; CHECK-LABEL: vfmin_vv_nxv1bf16_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
+; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
-; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vmerge.vvm v9, v10, v8, v0
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmerge.vvm v8, v8, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v8, v9
+; CHECK-NEXT:    vmerge.vvm v9, v10, v9, v0
+; CHECK-NEXT:    vfmin.vv v9, v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
 ; CHECK-NEXT:    ret
@@ -87,16 +85,14 @@ define <vscale x 2 x bfloat> @vfmin_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %
 ; CHECK-LABEL: vfmin_vv_nxv2bf16_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v9, v9
+; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
-; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmerge.vvm v9, v10, v8, v0
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmerge.vvm v8, v8, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v8, v9
+; CHECK-NEXT:    vmerge.vvm v9, v10, v9, v0
+; CHECK-NEXT:    vfmin.vv v9, v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
 ; CHECK-NEXT:    ret
@@ -134,15 +130,13 @@ define <vscale x 4 x bfloat> @vfmin_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
 ; CHECK-LABEL: vfmin_vv_nxv4bf16_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v12, v10, v0
+; CHECK-NEXT:    vmerge.vvm v8, v12, v10, v0
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
+; CHECK-NEXT:    vmerge.vvm v10, v10, v12, v0
 ; CHECK-NEXT:    vfmin.vv v10, v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
@@ -181,15 +175,13 @@ define <vscale x 8 x bfloat> @vfmin_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
 ; CHECK-LABEL: vfmin_vv_nxv8bf16_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v16, v12, v0
+; CHECK-NEXT:    vmerge.vvm v8, v16, v12, v0
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
+; CHECK-NEXT:    vmerge.vvm v12, v12, v16, v0
 ; CHECK-NEXT:    vfmin.vv v12, v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
@@ -633,16 +625,14 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ; ZVFHMIN-LABEL: vfmin_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
-; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -701,16 +691,14 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ; ZVFHMIN-LABEL: vfmin_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v10, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
-; ZVFHMIN-NEXT:    vfmin.vv v9, v8, v9
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v9, v0
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -771,15 +759,13 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ; ZVFHMIN-LABEL: vfmin_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v10, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v10, v10, v12, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v10, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -843,15 +829,13 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ; ZVFHMIN-LABEL: vfmin_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v12, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v12, v12, v16, v0
 ; ZVFHMIN-NEXT:    vfmin.vv v12, v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1615,8 +1599,6 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
@@ -1644,23 +1626,21 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
-; CHECK-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-NEXT:    vmv1r.v v0, v26
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-NEXT:    bltu a2, a1, .LBB40_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
@@ -1707,14 +1687,10 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
@@ -1726,45 +1702,36 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vl8re64.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT:    vfmin.vv v8, v16, v8
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfmin.vv v16, v16, v8
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v24, (a0)
 ; CHECK-NEXT:    bltu a2, a1, .LBB41_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB41_2:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
+; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v8, v8
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v24
-; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll
index 7a4695d1c25c1..3276c68b9b6ea 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll
@@ -12,17 +12,17 @@ define <vscale x 1 x half> @nearbyint_nxv1f16(<vscale x 1 x half> %v) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <vscale x 1 x half> @llvm.experimental.constrained.nearbyint.nxv1f16(<vscale x 1 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 1 x half> %r
@@ -36,17 +36,17 @@ define <vscale x 2 x half> @nearbyint_nxv2f16(<vscale x 2 x half> %v) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <vscale x 2 x half> @llvm.experimental.constrained.nearbyint.nxv2f16(<vscale x 2 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 2 x half> %r
@@ -60,17 +60,17 @@ define <vscale x 4 x half> @nearbyint_nxv4f16(<vscale x 4 x half> %v) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <vscale x 4 x half> @llvm.experimental.constrained.nearbyint.nxv4f16(<vscale x 4 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 4 x half> %r
@@ -84,17 +84,17 @@ define <vscale x 8 x half> @nearbyint_nxv8f16(<vscale x 8 x half> %v) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <vscale x 8 x half> @llvm.experimental.constrained.nearbyint.nxv8f16(<vscale x 8 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 8 x half> %r
@@ -108,17 +108,17 @@ define <vscale x 16 x half> @nearbyint_nxv16f16(<vscale x 16 x half> %v) strictf
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <vscale x 16 x half> @llvm.experimental.constrained.nearbyint.nxv16f16(<vscale x 16 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 16 x half> %r
@@ -132,17 +132,17 @@ define <vscale x 32 x half> @nearbyint_nxv32f16(<vscale x 32 x half> %v) strictf
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <vscale x 32 x half> @llvm.experimental.constrained.nearbyint.nxv32f16(<vscale x 32 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 32 x half> %r
@@ -158,15 +158,15 @@ define <vscale x 1 x float> @nearbyint_nxv1f32(<vscale x 1 x float> %v) strictfp
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <vscale x 1 x float> @llvm.experimental.constrained.nearbyint.nxv1f32(<vscale x 1 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 1 x float> %r
@@ -182,15 +182,15 @@ define <vscale x 2 x float> @nearbyint_nxv2f32(<vscale x 2 x float> %v) strictfp
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <vscale x 2 x float> @llvm.experimental.constrained.nearbyint.nxv2f32(<vscale x 2 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 2 x float> %r
@@ -206,15 +206,15 @@ define <vscale x 4 x float> @nearbyint_nxv4f32(<vscale x 4 x float> %v) strictfp
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <vscale x 4 x float> @llvm.experimental.constrained.nearbyint.nxv4f32(<vscale x 4 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 4 x float> %r
@@ -230,15 +230,15 @@ define <vscale x 8 x float> @nearbyint_nxv8f32(<vscale x 8 x float> %v) strictfp
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <vscale x 8 x float> @llvm.experimental.constrained.nearbyint.nxv8f32(<vscale x 8 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 8 x float> %r
@@ -254,15 +254,15 @@ define <vscale x 16 x float> @nearbyint_nxv16f32(<vscale x 16 x float> %v) stric
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <vscale x 16 x float> @llvm.experimental.constrained.nearbyint.nxv16f32(<vscale x 16 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 16 x float> %r
@@ -276,17 +276,17 @@ define <vscale x 1 x double> @nearbyint_nxv1f64(<vscale x 1 x double> %v) strict
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <vscale x 1 x double> @llvm.experimental.constrained.nearbyint.nxv1f64(<vscale x 1 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 1 x double> %r
@@ -300,17 +300,17 @@ define <vscale x 2 x double> @nearbyint_nxv2f64(<vscale x 2 x double> %v) strict
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <vscale x 2 x double> @llvm.experimental.constrained.nearbyint.nxv2f64(<vscale x 2 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 2 x double> %r
@@ -324,17 +324,17 @@ define <vscale x 4 x double> @nearbyint_nxv4f64(<vscale x 4 x double> %v) strict
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <vscale x 4 x double> @llvm.experimental.constrained.nearbyint.nxv4f64(<vscale x 4 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 4 x double> %r
@@ -348,17 +348,17 @@ define <vscale x 8 x double> @nearbyint_nxv8f64(<vscale x 8 x double> %v) strict
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %r = call <vscale x 8 x double> @llvm.experimental.constrained.nearbyint.nxv8f64(<vscale x 8 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <vscale x 8 x double> %r
diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll
index 807a3e460b153..f08f7669572c6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll
@@ -18,18 +18,18 @@ define <vscale x 1 x bfloat> @nearbyint_nxv1bf16(<vscale x 1 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 1 x bfloat> @llvm.nearbyint.nxv1bf16(<vscale x 1 x bfloat> %x)
   ret <vscale x 1 x bfloat> %a
@@ -41,18 +41,18 @@ define <vscale x 2 x bfloat> @nearbyint_nxv2bf16(<vscale x 2 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x bfloat> @llvm.nearbyint.nxv2bf16(<vscale x 2 x bfloat> %x)
   ret <vscale x 2 x bfloat> %a
@@ -64,18 +64,18 @@ define <vscale x 4 x bfloat> @nearbyint_nxv4bf16(<vscale x 4 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x bfloat> @llvm.nearbyint.nxv4bf16(<vscale x 4 x bfloat> %x)
   ret <vscale x 4 x bfloat> %a
@@ -87,18 +87,18 @@ define <vscale x 8 x bfloat> @nearbyint_nxv8bf16(<vscale x 8 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 8 x bfloat> @llvm.nearbyint.nxv8bf16(<vscale x 8 x bfloat> %x)
   ret <vscale x 8 x bfloat> %a
@@ -110,18 +110,18 @@ define <vscale x 16 x bfloat> @nearbyint_nxv16bf16(<vscale x 16 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 16 x bfloat> @llvm.nearbyint.nxv16bf16(<vscale x 16 x bfloat> %x)
   ret <vscale x 16 x bfloat> %a
@@ -163,11 +163,11 @@ define <vscale x 32 x bfloat> @nearbyint_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -182,17 +182,17 @@ define <vscale x 32 x bfloat> @nearbyint_nxv32bf16(<vscale x 32 x bfloat> %x) {
 define <vscale x 1 x half> @nearbyint_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFH-LABEL: nearbyint_nxv1f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI6_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI6_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a0)
 ; ZVFH-NEXT:    frflags a0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: nearbyint_nxv1f16:
@@ -200,18 +200,18 @@ define <vscale x 1 x half> @nearbyint_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    ret
   %a = call <vscale x 1 x half> @llvm.nearbyint.nxv1f16(<vscale x 1 x half> %x)
   ret <vscale x 1 x half> %a
@@ -221,17 +221,17 @@ declare <vscale x 1 x half> @llvm.nearbyint.nxv1f16(<vscale x 1 x half>)
 define <vscale x 2 x half> @nearbyint_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFH-LABEL: nearbyint_nxv2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI7_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a0)
 ; ZVFH-NEXT:    frflags a0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: nearbyint_nxv2f16:
@@ -239,18 +239,18 @@ define <vscale x 2 x half> @nearbyint_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    ret
   %a = call <vscale x 2 x half> @llvm.nearbyint.nxv2f16(<vscale x 2 x half> %x)
   ret <vscale x 2 x half> %a
@@ -260,17 +260,17 @@ declare <vscale x 2 x half> @llvm.nearbyint.nxv2f16(<vscale x 2 x half>)
 define <vscale x 4 x half> @nearbyint_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFH-LABEL: nearbyint_nxv4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI8_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI8_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI8_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI8_0)(a0)
 ; ZVFH-NEXT:    frflags a0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: nearbyint_nxv4f16:
@@ -278,18 +278,18 @@ define <vscale x 4 x half> @nearbyint_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    ret
   %a = call <vscale x 4 x half> @llvm.nearbyint.nxv4f16(<vscale x 4 x half> %x)
   ret <vscale x 4 x half> %a
@@ -299,17 +299,17 @@ declare <vscale x 4 x half> @llvm.nearbyint.nxv4f16(<vscale x 4 x half>)
 define <vscale x 8 x half> @nearbyint_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFH-LABEL: nearbyint_nxv8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI9_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI9_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI9_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI9_0)(a0)
 ; ZVFH-NEXT:    frflags a0
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: nearbyint_nxv8f16:
@@ -317,18 +317,18 @@ define <vscale x 8 x half> @nearbyint_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    ret
   %a = call <vscale x 8 x half> @llvm.nearbyint.nxv8f16(<vscale x 8 x half> %x)
   ret <vscale x 8 x half> %a
@@ -338,17 +338,17 @@ declare <vscale x 8 x half> @llvm.nearbyint.nxv8f16(<vscale x 8 x half>)
 define <vscale x 16 x half> @nearbyint_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFH-LABEL: nearbyint_nxv16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI10_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI10_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8
-; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI10_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI10_0)(a0)
 ; ZVFH-NEXT:    frflags a0
+; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: nearbyint_nxv16f16:
@@ -356,18 +356,18 @@ define <vscale x 16 x half> @nearbyint_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    ret
   %a = call <vscale x 16 x half> @llvm.nearbyint.nxv16f16(<vscale x 16 x half> %x)
   ret <vscale x 16 x half> %a
@@ -377,17 +377,17 @@ declare <vscale x 16 x half> @llvm.nearbyint.nxv16f16(<vscale x 16 x half>)
 define <vscale x 32 x half> @nearbyint_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFH-LABEL: nearbyint_nxv32f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI11_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI11_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8
-; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI11_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI11_0)(a0)
 ; ZVFH-NEXT:    frflags a0
+; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: nearbyint_nxv32f16:
@@ -425,11 +425,11 @@ define <vscale x 32 x half> @nearbyint_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -449,13 +449,13 @@ define <vscale x 1 x float> @nearbyint_nxv1f32(<vscale x 1 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 1 x float> @llvm.nearbyint.nxv1f32(<vscale x 1 x float> %x)
   ret <vscale x 1 x float> %a
@@ -469,13 +469,13 @@ define <vscale x 2 x float> @nearbyint_nxv2f32(<vscale x 2 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x float> @llvm.nearbyint.nxv2f32(<vscale x 2 x float> %x)
   ret <vscale x 2 x float> %a
@@ -489,13 +489,13 @@ define <vscale x 4 x float> @nearbyint_nxv4f32(<vscale x 4 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float> %x)
   ret <vscale x 4 x float> %a
@@ -509,13 +509,13 @@ define <vscale x 8 x float> @nearbyint_nxv8f32(<vscale x 8 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 8 x float> @llvm.nearbyint.nxv8f32(<vscale x 8 x float> %x)
   ret <vscale x 8 x float> %a
@@ -529,13 +529,13 @@ define <vscale x 16 x float> @nearbyint_nxv16f32(<vscale x 16 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 16 x float> @llvm.nearbyint.nxv16f32(<vscale x 16 x float> %x)
   ret <vscale x 16 x float> %a
@@ -545,17 +545,17 @@ declare <vscale x 16 x float> @llvm.nearbyint.nxv16f32(<vscale x 16 x float>)
 define <vscale x 1 x double> @nearbyint_nxv1f64(<vscale x 1 x double> %x) {
 ; CHECK-LABEL: nearbyint_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 1 x double> @llvm.nearbyint.nxv1f64(<vscale x 1 x double> %x)
   ret <vscale x 1 x double> %a
@@ -565,17 +565,17 @@ declare <vscale x 1 x double> @llvm.nearbyint.nxv1f64(<vscale x 1 x double>)
 define <vscale x 2 x double> @nearbyint_nxv2f64(<vscale x 2 x double> %x) {
 ; CHECK-LABEL: nearbyint_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double> %x)
   ret <vscale x 2 x double> %a
@@ -585,17 +585,17 @@ declare <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double>)
 define <vscale x 4 x double> @nearbyint_nxv4f64(<vscale x 4 x double> %x) {
 ; CHECK-LABEL: nearbyint_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 4 x double> @llvm.nearbyint.nxv4f64(<vscale x 4 x double> %x)
   ret <vscale x 4 x double> %a
@@ -605,17 +605,17 @@ declare <vscale x 4 x double> @llvm.nearbyint.nxv4f64(<vscale x 4 x double>)
 define <vscale x 8 x double> @nearbyint_nxv8f64(<vscale x 8 x double> %x) {
 ; CHECK-LABEL: nearbyint_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI20_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI20_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 8 x double> @llvm.nearbyint.nxv8f64(<vscale x 8 x double> %x)
   ret <vscale x 8 x double> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll
index 5e657a93ec0d6..a420e9ecee563 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll
@@ -7,10 +7,10 @@
 define i32 @test(i32 %size, ptr %add.ptr, i64 %const) {
 ; RV32-LABEL: test:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    addi a3, a2, 1
-; RV32-NEXT:    th.lbib a4, (a1), -1, 0
+; RV32-NEXT:    th.lbib a3, (a1), -1, 0
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    addi a3, a2, 1
 ; RV32-NEXT:    vmv.s.x v9, zero
 ; RV32-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
 ; RV32-NEXT:    vslideup.vx v8, v9, a2
@@ -33,10 +33,10 @@ define i32 @test(i32 %size, ptr %add.ptr, i64 %const) {
 ;
 ; RV64-LABEL: test:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a3, a2, 1
-; RV64-NEXT:    th.lbib a4, (a1), -1, 0
+; RV64-NEXT:    th.lbib a3, (a1), -1, 0
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT:    vmv.v.x v8, a4
+; RV64-NEXT:    vmv.v.x v8, a3
+; RV64-NEXT:    addi a3, a2, 1
 ; RV64-NEXT:    vmv.s.x v9, zero
 ; RV64-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
 ; RV64-NEXT:    vslideup.vx v8, v9, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-vector-cmp.ll b/llvm/test/CodeGen/RISCV/rvv/fold-vector-cmp.ll
index e24b23c9b2d32..7504c570e6c7a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fold-vector-cmp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fold-vector-cmp.ll
@@ -15,11 +15,11 @@ define i32 @test(i32 %call.i) {
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.v.x v8, a0
 ; CHECK-V-NEXT:    lui a0, 524288
+; CHECK-V-NEXT:    vmv.v.i v9, 0
 ; CHECK-V-NEXT:    vslide1down.vx v8, v8, a0
 ; CHECK-V-NEXT:    addi a0, a0, 2
 ; CHECK-V-NEXT:    vmslt.vx v0, v8, a0
-; CHECK-V-NEXT:    vmv.v.i v8, 0
-; CHECK-V-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-V-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-V-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-V-NEXT:    vmv.x.s a0, v8
 ; CHECK-V-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index f6598606b09f1..052a10e0adcdc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -9,10 +9,10 @@
 define <2 x i32> @stest_f64i32(<2 x double> %x) {
 ; CHECK-NOV-LABEL: stest_f64i32:
 ; CHECK-NOV:       # %bb.0: # %entry
+; CHECK-NOV-NEXT:    fcvt.l.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    fcvt.l.d a1, fa1, rtz
 ; CHECK-NOV-NEXT:    lui a2, 524288
 ; CHECK-NOV-NEXT:    addiw a3, a2, -1
-; CHECK-NOV-NEXT:    fcvt.l.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    bge a1, a3, .LBB0_5
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    bge a0, a3, .LBB0_6
@@ -55,10 +55,10 @@ entry:
 define <2 x i32> @utest_f64i32(<2 x double> %x) {
 ; CHECK-NOV-LABEL: utest_f64i32:
 ; CHECK-NOV:       # %bb.0: # %entry
+; CHECK-NOV-NEXT:    fcvt.lu.d a1, fa1, rtz
 ; CHECK-NOV-NEXT:    fcvt.lu.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    li a2, -1
 ; CHECK-NOV-NEXT:    srli a2, a2, 32
-; CHECK-NOV-NEXT:    fcvt.lu.d a1, fa1, rtz
 ; CHECK-NOV-NEXT:    bgeu a0, a2, .LBB1_3
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    bgeu a1, a2, .LBB1_4
@@ -89,10 +89,10 @@ entry:
 define <2 x i32> @ustest_f64i32(<2 x double> %x) {
 ; CHECK-NOV-LABEL: ustest_f64i32:
 ; CHECK-NOV:       # %bb.0: # %entry
+; CHECK-NOV-NEXT:    fcvt.l.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    fcvt.l.d a1, fa1, rtz
 ; CHECK-NOV-NEXT:    li a2, -1
 ; CHECK-NOV-NEXT:    srli a2, a2, 32
-; CHECK-NOV-NEXT:    fcvt.l.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    blt a1, a2, .LBB2_2
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    mv a1, a2
@@ -130,14 +130,14 @@ entry:
 define <4 x i32> @stest_f32i32(<4 x float> %x) {
 ; CHECK-NOV-LABEL: stest_f32i32:
 ; CHECK-NOV:       # %bb.0: # %entry
-; CHECK-NOV-NEXT:    fcvt.l.s a1, fa3, rtz
+; CHECK-NOV-NEXT:    fcvt.l.s a1, fa2, rtz
+; CHECK-NOV-NEXT:    fcvt.l.s a2, fa3, rtz
 ; CHECK-NOV-NEXT:    lui a3, 524288
 ; CHECK-NOV-NEXT:    addiw a6, a3, -1
-; CHECK-NOV-NEXT:    fcvt.l.s a2, fa2, rtz
-; CHECK-NOV-NEXT:    bge a1, a6, .LBB3_10
+; CHECK-NOV-NEXT:    bge a2, a6, .LBB3_10
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    fcvt.l.s a4, fa1, rtz
-; CHECK-NOV-NEXT:    bge a2, a6, .LBB3_11
+; CHECK-NOV-NEXT:    bge a1, a6, .LBB3_11
 ; CHECK-NOV-NEXT:  .LBB3_2: # %entry
 ; CHECK-NOV-NEXT:    fcvt.l.s a5, fa0, rtz
 ; CHECK-NOV-NEXT:    bge a4, a6, .LBB3_12
@@ -148,23 +148,23 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) {
 ; CHECK-NOV-NEXT:  .LBB3_5: # %entry
 ; CHECK-NOV-NEXT:    bge a3, a4, .LBB3_15
 ; CHECK-NOV-NEXT:  .LBB3_6: # %entry
-; CHECK-NOV-NEXT:    bge a3, a2, .LBB3_16
+; CHECK-NOV-NEXT:    bge a3, a1, .LBB3_16
 ; CHECK-NOV-NEXT:  .LBB3_7: # %entry
-; CHECK-NOV-NEXT:    blt a3, a1, .LBB3_9
+; CHECK-NOV-NEXT:    blt a3, a2, .LBB3_9
 ; CHECK-NOV-NEXT:  .LBB3_8: # %entry
-; CHECK-NOV-NEXT:    lui a1, 524288
+; CHECK-NOV-NEXT:    lui a2, 524288
 ; CHECK-NOV-NEXT:  .LBB3_9: # %entry
 ; CHECK-NOV-NEXT:    sw a5, 0(a0)
 ; CHECK-NOV-NEXT:    sw a4, 4(a0)
-; CHECK-NOV-NEXT:    sw a2, 8(a0)
-; CHECK-NOV-NEXT:    sw a1, 12(a0)
+; CHECK-NOV-NEXT:    sw a1, 8(a0)
+; CHECK-NOV-NEXT:    sw a2, 12(a0)
 ; CHECK-NOV-NEXT:    ret
 ; CHECK-NOV-NEXT:  .LBB3_10: # %entry
-; CHECK-NOV-NEXT:    mv a1, a6
+; CHECK-NOV-NEXT:    mv a2, a6
 ; CHECK-NOV-NEXT:    fcvt.l.s a4, fa1, rtz
-; CHECK-NOV-NEXT:    blt a2, a6, .LBB3_2
+; CHECK-NOV-NEXT:    blt a1, a6, .LBB3_2
 ; CHECK-NOV-NEXT:  .LBB3_11: # %entry
-; CHECK-NOV-NEXT:    mv a2, a6
+; CHECK-NOV-NEXT:    mv a1, a6
 ; CHECK-NOV-NEXT:    fcvt.l.s a5, fa0, rtz
 ; CHECK-NOV-NEXT:    blt a4, a6, .LBB3_3
 ; CHECK-NOV-NEXT:  .LBB3_12: # %entry
@@ -178,10 +178,10 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) {
 ; CHECK-NOV-NEXT:    blt a3, a4, .LBB3_6
 ; CHECK-NOV-NEXT:  .LBB3_15: # %entry
 ; CHECK-NOV-NEXT:    lui a4, 524288
-; CHECK-NOV-NEXT:    blt a3, a2, .LBB3_7
+; CHECK-NOV-NEXT:    blt a3, a1, .LBB3_7
 ; CHECK-NOV-NEXT:  .LBB3_16: # %entry
-; CHECK-NOV-NEXT:    lui a2, 524288
-; CHECK-NOV-NEXT:    bge a3, a1, .LBB3_8
+; CHECK-NOV-NEXT:    lui a1, 524288
+; CHECK-NOV-NEXT:    bge a3, a2, .LBB3_8
 ; CHECK-NOV-NEXT:    j .LBB3_9
 ;
 ; CHECK-V-LABEL: stest_f32i32:
@@ -203,14 +203,14 @@ entry:
 define <4 x i32> @utest_f32i32(<4 x float> %x) {
 ; CHECK-NOV-LABEL: utest_f32i32:
 ; CHECK-NOV:       # %bb.0: # %entry
-; CHECK-NOV-NEXT:    fcvt.lu.s a1, fa0, rtz
+; CHECK-NOV-NEXT:    fcvt.lu.s a1, fa1, rtz
+; CHECK-NOV-NEXT:    fcvt.lu.s a2, fa0, rtz
 ; CHECK-NOV-NEXT:    li a3, -1
 ; CHECK-NOV-NEXT:    srli a3, a3, 32
-; CHECK-NOV-NEXT:    fcvt.lu.s a2, fa1, rtz
-; CHECK-NOV-NEXT:    bgeu a1, a3, .LBB4_6
+; CHECK-NOV-NEXT:    bgeu a2, a3, .LBB4_6
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    fcvt.lu.s a4, fa2, rtz
-; CHECK-NOV-NEXT:    bgeu a2, a3, .LBB4_7
+; CHECK-NOV-NEXT:    bgeu a1, a3, .LBB4_7
 ; CHECK-NOV-NEXT:  .LBB4_2: # %entry
 ; CHECK-NOV-NEXT:    fcvt.lu.s a5, fa3, rtz
 ; CHECK-NOV-NEXT:    bgeu a4, a3, .LBB4_8
@@ -219,17 +219,17 @@ define <4 x i32> @utest_f32i32(<4 x float> %x) {
 ; CHECK-NOV-NEXT:  .LBB4_4: # %entry
 ; CHECK-NOV-NEXT:    mv a5, a3
 ; CHECK-NOV-NEXT:  .LBB4_5: # %entry
-; CHECK-NOV-NEXT:    sw a1, 0(a0)
-; CHECK-NOV-NEXT:    sw a2, 4(a0)
+; CHECK-NOV-NEXT:    sw a2, 0(a0)
+; CHECK-NOV-NEXT:    sw a1, 4(a0)
 ; CHECK-NOV-NEXT:    sw a4, 8(a0)
 ; CHECK-NOV-NEXT:    sw a5, 12(a0)
 ; CHECK-NOV-NEXT:    ret
 ; CHECK-NOV-NEXT:  .LBB4_6: # %entry
-; CHECK-NOV-NEXT:    mv a1, a3
+; CHECK-NOV-NEXT:    mv a2, a3
 ; CHECK-NOV-NEXT:    fcvt.lu.s a4, fa2, rtz
-; CHECK-NOV-NEXT:    bltu a2, a3, .LBB4_2
+; CHECK-NOV-NEXT:    bltu a1, a3, .LBB4_2
 ; CHECK-NOV-NEXT:  .LBB4_7: # %entry
-; CHECK-NOV-NEXT:    mv a2, a3
+; CHECK-NOV-NEXT:    mv a1, a3
 ; CHECK-NOV-NEXT:    fcvt.lu.s a5, fa3, rtz
 ; CHECK-NOV-NEXT:    bltu a4, a3, .LBB4_3
 ; CHECK-NOV-NEXT:  .LBB4_8: # %entry
@@ -254,10 +254,10 @@ entry:
 define <4 x i32> @ustest_f32i32(<4 x float> %x) {
 ; CHECK-NOV-LABEL: ustest_f32i32:
 ; CHECK-NOV:       # %bb.0: # %entry
+; CHECK-NOV-NEXT:    fcvt.l.s a2, fa2, rtz
 ; CHECK-NOV-NEXT:    fcvt.l.s a1, fa3, rtz
 ; CHECK-NOV-NEXT:    li a4, -1
 ; CHECK-NOV-NEXT:    srli a4, a4, 32
-; CHECK-NOV-NEXT:    fcvt.l.s a2, fa2, rtz
 ; CHECK-NOV-NEXT:    bge a1, a4, .LBB5_6
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    fcvt.l.s a3, fa1, rtz
@@ -341,12 +341,12 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
 ; CHECK-NOV-NEXT:    .cfi_remember_state
+; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
 ; CHECK-NOV-NEXT:    lhu s2, 8(a1)
-; CHECK-NOV-NEXT:    lhu a2, 16(a1)
+; CHECK-NOV-NEXT:    lhu a0, 16(a1)
 ; CHECK-NOV-NEXT:    lhu s3, 24(a1)
-; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
@@ -355,8 +355,8 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs2, rtz
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-NOV-NEXT:    lui a1, 524288
@@ -454,11 +454,11 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
@@ -473,11 +473,11 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
@@ -541,22 +541,22 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
 ; CHECK-NOV-NEXT:    .cfi_remember_state
-; CHECK-NOV-NEXT:    lhu s1, 0(a1)
-; CHECK-NOV-NEXT:    lhu a2, 8(a1)
-; CHECK-NOV-NEXT:    lhu s2, 16(a1)
-; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
+; CHECK-NOV-NEXT:    lhu s2, 0(a1)
+; CHECK-NOV-NEXT:    lhu a0, 8(a1)
+; CHECK-NOV-NEXT:    lhu s1, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    fcvt.lu.s s1, fs2, rtz
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-NOV-NEXT:    li a1, -1
@@ -634,11 +634,11 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
@@ -653,11 +653,11 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
@@ -719,12 +719,12 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
 ; CHECK-NOV-NEXT:    .cfi_remember_state
+; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
 ; CHECK-NOV-NEXT:    lhu s2, 8(a1)
-; CHECK-NOV-NEXT:    lhu a2, 16(a1)
+; CHECK-NOV-NEXT:    lhu a0, 16(a1)
 ; CHECK-NOV-NEXT:    lhu s3, 24(a1)
-; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
@@ -733,8 +733,8 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs2, rtz
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-NOV-NEXT:    li a2, -1
@@ -824,11 +824,11 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
@@ -843,11 +843,11 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
@@ -895,10 +895,10 @@ entry:
 define <2 x i16> @stest_f64i16(<2 x double> %x) {
 ; CHECK-NOV-LABEL: stest_f64i16:
 ; CHECK-NOV:       # %bb.0: # %entry
+; CHECK-NOV-NEXT:    fcvt.w.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    fcvt.w.d a1, fa1, rtz
 ; CHECK-NOV-NEXT:    lui a2, 8
 ; CHECK-NOV-NEXT:    addiw a2, a2, -1
-; CHECK-NOV-NEXT:    fcvt.w.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    bge a1, a2, .LBB9_5
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    bge a0, a2, .LBB9_6
@@ -943,10 +943,10 @@ entry:
 define <2 x i16> @utest_f64i16(<2 x double> %x) {
 ; CHECK-NOV-LABEL: utest_f64i16:
 ; CHECK-NOV:       # %bb.0: # %entry
+; CHECK-NOV-NEXT:    fcvt.wu.d a1, fa1, rtz
 ; CHECK-NOV-NEXT:    fcvt.wu.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    lui a2, 16
 ; CHECK-NOV-NEXT:    addiw a2, a2, -1
-; CHECK-NOV-NEXT:    fcvt.wu.d a1, fa1, rtz
 ; CHECK-NOV-NEXT:    bgeu a0, a2, .LBB10_3
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    bgeu a1, a2, .LBB10_4
@@ -977,10 +977,10 @@ entry:
 define <2 x i16> @ustest_f64i16(<2 x double> %x) {
 ; CHECK-NOV-LABEL: ustest_f64i16:
 ; CHECK-NOV:       # %bb.0: # %entry
+; CHECK-NOV-NEXT:    fcvt.w.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    fcvt.w.d a1, fa1, rtz
 ; CHECK-NOV-NEXT:    lui a2, 16
 ; CHECK-NOV-NEXT:    addiw a2, a2, -1
-; CHECK-NOV-NEXT:    fcvt.w.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    blt a1, a2, .LBB11_2
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    mv a1, a2
@@ -1018,14 +1018,14 @@ entry:
 define <4 x i16> @stest_f32i16(<4 x float> %x) {
 ; CHECK-NOV-LABEL: stest_f32i16:
 ; CHECK-NOV:       # %bb.0: # %entry
-; CHECK-NOV-NEXT:    fcvt.w.s a1, fa3, rtz
+; CHECK-NOV-NEXT:    fcvt.w.s a1, fa2, rtz
+; CHECK-NOV-NEXT:    fcvt.w.s a2, fa3, rtz
 ; CHECK-NOV-NEXT:    lui a5, 8
 ; CHECK-NOV-NEXT:    addiw a5, a5, -1
-; CHECK-NOV-NEXT:    fcvt.w.s a2, fa2, rtz
-; CHECK-NOV-NEXT:    bge a1, a5, .LBB12_10
+; CHECK-NOV-NEXT:    bge a2, a5, .LBB12_10
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    fcvt.w.s a3, fa1, rtz
-; CHECK-NOV-NEXT:    bge a2, a5, .LBB12_11
+; CHECK-NOV-NEXT:    bge a1, a5, .LBB12_11
 ; CHECK-NOV-NEXT:  .LBB12_2: # %entry
 ; CHECK-NOV-NEXT:    fcvt.w.s a4, fa0, rtz
 ; CHECK-NOV-NEXT:    bge a3, a5, .LBB12_12
@@ -1037,23 +1037,23 @@ define <4 x i16> @stest_f32i16(<4 x float> %x) {
 ; CHECK-NOV-NEXT:  .LBB12_5: # %entry
 ; CHECK-NOV-NEXT:    bge a5, a3, .LBB12_15
 ; CHECK-NOV-NEXT:  .LBB12_6: # %entry
-; CHECK-NOV-NEXT:    bge a5, a2, .LBB12_16
+; CHECK-NOV-NEXT:    bge a5, a1, .LBB12_16
 ; CHECK-NOV-NEXT:  .LBB12_7: # %entry
-; CHECK-NOV-NEXT:    blt a5, a1, .LBB12_9
+; CHECK-NOV-NEXT:    blt a5, a2, .LBB12_9
 ; CHECK-NOV-NEXT:  .LBB12_8: # %entry
-; CHECK-NOV-NEXT:    lui a1, 1048568
+; CHECK-NOV-NEXT:    lui a2, 1048568
 ; CHECK-NOV-NEXT:  .LBB12_9: # %entry
 ; CHECK-NOV-NEXT:    sh a4, 0(a0)
 ; CHECK-NOV-NEXT:    sh a3, 2(a0)
-; CHECK-NOV-NEXT:    sh a2, 4(a0)
-; CHECK-NOV-NEXT:    sh a1, 6(a0)
+; CHECK-NOV-NEXT:    sh a1, 4(a0)
+; CHECK-NOV-NEXT:    sh a2, 6(a0)
 ; CHECK-NOV-NEXT:    ret
 ; CHECK-NOV-NEXT:  .LBB12_10: # %entry
-; CHECK-NOV-NEXT:    mv a1, a5
+; CHECK-NOV-NEXT:    mv a2, a5
 ; CHECK-NOV-NEXT:    fcvt.w.s a3, fa1, rtz
-; CHECK-NOV-NEXT:    blt a2, a5, .LBB12_2
+; CHECK-NOV-NEXT:    blt a1, a5, .LBB12_2
 ; CHECK-NOV-NEXT:  .LBB12_11: # %entry
-; CHECK-NOV-NEXT:    mv a2, a5
+; CHECK-NOV-NEXT:    mv a1, a5
 ; CHECK-NOV-NEXT:    fcvt.w.s a4, fa0, rtz
 ; CHECK-NOV-NEXT:    blt a3, a5, .LBB12_3
 ; CHECK-NOV-NEXT:  .LBB12_12: # %entry
@@ -1068,10 +1068,10 @@ define <4 x i16> @stest_f32i16(<4 x float> %x) {
 ; CHECK-NOV-NEXT:    blt a5, a3, .LBB12_6
 ; CHECK-NOV-NEXT:  .LBB12_15: # %entry
 ; CHECK-NOV-NEXT:    lui a3, 1048568
-; CHECK-NOV-NEXT:    blt a5, a2, .LBB12_7
+; CHECK-NOV-NEXT:    blt a5, a1, .LBB12_7
 ; CHECK-NOV-NEXT:  .LBB12_16: # %entry
-; CHECK-NOV-NEXT:    lui a2, 1048568
-; CHECK-NOV-NEXT:    bge a5, a1, .LBB12_8
+; CHECK-NOV-NEXT:    lui a1, 1048568
+; CHECK-NOV-NEXT:    bge a5, a2, .LBB12_8
 ; CHECK-NOV-NEXT:    j .LBB12_9
 ;
 ; CHECK-V-LABEL: stest_f32i16:
@@ -1094,14 +1094,14 @@ entry:
 define <4 x i16> @utest_f32i16(<4 x float> %x) {
 ; CHECK-NOV-LABEL: utest_f32i16:
 ; CHECK-NOV:       # %bb.0: # %entry
-; CHECK-NOV-NEXT:    fcvt.wu.s a1, fa0, rtz
+; CHECK-NOV-NEXT:    fcvt.wu.s a1, fa1, rtz
+; CHECK-NOV-NEXT:    fcvt.wu.s a2, fa0, rtz
 ; CHECK-NOV-NEXT:    lui a3, 16
 ; CHECK-NOV-NEXT:    addiw a3, a3, -1
-; CHECK-NOV-NEXT:    fcvt.wu.s a2, fa1, rtz
-; CHECK-NOV-NEXT:    bgeu a1, a3, .LBB13_6
+; CHECK-NOV-NEXT:    bgeu a2, a3, .LBB13_6
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    fcvt.wu.s a4, fa2, rtz
-; CHECK-NOV-NEXT:    bgeu a2, a3, .LBB13_7
+; CHECK-NOV-NEXT:    bgeu a1, a3, .LBB13_7
 ; CHECK-NOV-NEXT:  .LBB13_2: # %entry
 ; CHECK-NOV-NEXT:    fcvt.wu.s a5, fa3, rtz
 ; CHECK-NOV-NEXT:    bgeu a4, a3, .LBB13_8
@@ -1110,17 +1110,17 @@ define <4 x i16> @utest_f32i16(<4 x float> %x) {
 ; CHECK-NOV-NEXT:  .LBB13_4: # %entry
 ; CHECK-NOV-NEXT:    mv a5, a3
 ; CHECK-NOV-NEXT:  .LBB13_5: # %entry
-; CHECK-NOV-NEXT:    sh a1, 0(a0)
-; CHECK-NOV-NEXT:    sh a2, 2(a0)
+; CHECK-NOV-NEXT:    sh a2, 0(a0)
+; CHECK-NOV-NEXT:    sh a1, 2(a0)
 ; CHECK-NOV-NEXT:    sh a4, 4(a0)
 ; CHECK-NOV-NEXT:    sh a5, 6(a0)
 ; CHECK-NOV-NEXT:    ret
 ; CHECK-NOV-NEXT:  .LBB13_6: # %entry
-; CHECK-NOV-NEXT:    mv a1, a3
+; CHECK-NOV-NEXT:    mv a2, a3
 ; CHECK-NOV-NEXT:    fcvt.wu.s a4, fa2, rtz
-; CHECK-NOV-NEXT:    bltu a2, a3, .LBB13_2
+; CHECK-NOV-NEXT:    bltu a1, a3, .LBB13_2
 ; CHECK-NOV-NEXT:  .LBB13_7: # %entry
-; CHECK-NOV-NEXT:    mv a2, a3
+; CHECK-NOV-NEXT:    mv a1, a3
 ; CHECK-NOV-NEXT:    fcvt.wu.s a5, fa3, rtz
 ; CHECK-NOV-NEXT:    bltu a4, a3, .LBB13_3
 ; CHECK-NOV-NEXT:  .LBB13_8: # %entry
@@ -1146,10 +1146,10 @@ entry:
 define <4 x i16> @ustest_f32i16(<4 x float> %x) {
 ; CHECK-NOV-LABEL: ustest_f32i16:
 ; CHECK-NOV:       # %bb.0: # %entry
+; CHECK-NOV-NEXT:    fcvt.w.s a2, fa2, rtz
 ; CHECK-NOV-NEXT:    fcvt.w.s a1, fa3, rtz
 ; CHECK-NOV-NEXT:    lui a4, 16
 ; CHECK-NOV-NEXT:    addiw a4, a4, -1
-; CHECK-NOV-NEXT:    fcvt.w.s a2, fa2, rtz
 ; CHECK-NOV-NEXT:    bge a1, a4, .LBB14_6
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    fcvt.w.s a3, fa1, rtz
@@ -1248,16 +1248,16 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
 ; CHECK-NOV-NEXT:    .cfi_remember_state
+; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    lhu s1, 32(a1)
 ; CHECK-NOV-NEXT:    lhu s2, 40(a1)
-; CHECK-NOV-NEXT:    lhu a2, 48(a1)
+; CHECK-NOV-NEXT:    lhu a0, 48(a1)
 ; CHECK-NOV-NEXT:    lhu s3, 56(a1)
 ; CHECK-NOV-NEXT:    lhu s4, 0(a1)
 ; CHECK-NOV-NEXT:    lhu s5, 8(a1)
 ; CHECK-NOV-NEXT:    lhu s6, 16(a1)
 ; CHECK-NOV-NEXT:    lhu s7, 24(a1)
-; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
@@ -1278,8 +1278,8 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs6, rtz
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-NOV-NEXT:    lui a7, 8
@@ -1458,7 +1458,6 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -1466,6 +1465,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -1483,7 +1483,6 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -1491,6 +1490,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -1515,11 +1515,11 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -1534,11 +1534,11 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -1632,16 +1632,16 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
 ; CHECK-NOV-NEXT:    .cfi_remember_state
+; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    lhu s1, 32(a1)
 ; CHECK-NOV-NEXT:    lhu s2, 40(a1)
 ; CHECK-NOV-NEXT:    lhu s3, 48(a1)
 ; CHECK-NOV-NEXT:    lhu s4, 56(a1)
 ; CHECK-NOV-NEXT:    lhu s5, 0(a1)
-; CHECK-NOV-NEXT:    lhu a2, 8(a1)
+; CHECK-NOV-NEXT:    lhu a0, 8(a1)
 ; CHECK-NOV-NEXT:    lhu s6, 16(a1)
 ; CHECK-NOV-NEXT:    lhu s7, 24(a1)
-; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
@@ -1662,8 +1662,8 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    fcvt.lu.s s1, fs6, rtz
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-NOV-NEXT:    lui a3, 16
@@ -1800,7 +1800,6 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -1808,6 +1807,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -1825,7 +1825,6 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -1833,6 +1832,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -1857,11 +1857,11 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -1876,11 +1876,11 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -1972,16 +1972,16 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
 ; CHECK-NOV-NEXT:    .cfi_remember_state
+; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    lhu s1, 32(a1)
 ; CHECK-NOV-NEXT:    lhu s2, 40(a1)
-; CHECK-NOV-NEXT:    lhu a2, 48(a1)
+; CHECK-NOV-NEXT:    lhu a0, 48(a1)
 ; CHECK-NOV-NEXT:    lhu s3, 56(a1)
 ; CHECK-NOV-NEXT:    lhu s4, 0(a1)
 ; CHECK-NOV-NEXT:    lhu s5, 8(a1)
 ; CHECK-NOV-NEXT:    lhu s6, 16(a1)
 ; CHECK-NOV-NEXT:    lhu s7, 24(a1)
-; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
@@ -2002,8 +2002,8 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs6, rtz
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-NOV-NEXT:    lui a4, 16
@@ -2164,7 +2164,6 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -2172,6 +2171,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -2189,7 +2189,6 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -2197,6 +2196,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -2221,11 +2221,11 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -2240,11 +2240,11 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -3576,10 +3576,10 @@ entry:
 define <2 x i32> @stest_f64i32_mm(<2 x double> %x) {
 ; CHECK-NOV-LABEL: stest_f64i32_mm:
 ; CHECK-NOV:       # %bb.0: # %entry
+; CHECK-NOV-NEXT:    fcvt.l.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    fcvt.l.d a1, fa1, rtz
 ; CHECK-NOV-NEXT:    lui a2, 524288
 ; CHECK-NOV-NEXT:    addiw a3, a2, -1
-; CHECK-NOV-NEXT:    fcvt.l.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    bge a1, a3, .LBB27_5
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    bge a0, a3, .LBB27_6
@@ -3620,10 +3620,10 @@ entry:
 define <2 x i32> @utest_f64i32_mm(<2 x double> %x) {
 ; CHECK-NOV-LABEL: utest_f64i32_mm:
 ; CHECK-NOV:       # %bb.0: # %entry
+; CHECK-NOV-NEXT:    fcvt.lu.d a1, fa1, rtz
 ; CHECK-NOV-NEXT:    fcvt.lu.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    li a2, -1
 ; CHECK-NOV-NEXT:    srli a2, a2, 32
-; CHECK-NOV-NEXT:    fcvt.lu.d a1, fa1, rtz
 ; CHECK-NOV-NEXT:    bgeu a0, a2, .LBB28_3
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    bgeu a1, a2, .LBB28_4
@@ -3653,10 +3653,10 @@ entry:
 define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) {
 ; CHECK-NOV-LABEL: ustest_f64i32_mm:
 ; CHECK-NOV:       # %bb.0: # %entry
+; CHECK-NOV-NEXT:    fcvt.l.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    fcvt.l.d a1, fa1, rtz
 ; CHECK-NOV-NEXT:    li a2, -1
 ; CHECK-NOV-NEXT:    srli a2, a2, 32
-; CHECK-NOV-NEXT:    fcvt.l.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    blt a1, a2, .LBB29_2
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    mv a1, a2
@@ -3692,14 +3692,14 @@ entry:
 define <4 x i32> @stest_f32i32_mm(<4 x float> %x) {
 ; CHECK-NOV-LABEL: stest_f32i32_mm:
 ; CHECK-NOV:       # %bb.0: # %entry
-; CHECK-NOV-NEXT:    fcvt.l.s a1, fa3, rtz
+; CHECK-NOV-NEXT:    fcvt.l.s a1, fa2, rtz
+; CHECK-NOV-NEXT:    fcvt.l.s a2, fa3, rtz
 ; CHECK-NOV-NEXT:    lui a3, 524288
 ; CHECK-NOV-NEXT:    addiw a6, a3, -1
-; CHECK-NOV-NEXT:    fcvt.l.s a2, fa2, rtz
-; CHECK-NOV-NEXT:    bge a1, a6, .LBB30_10
+; CHECK-NOV-NEXT:    bge a2, a6, .LBB30_10
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    fcvt.l.s a4, fa1, rtz
-; CHECK-NOV-NEXT:    bge a2, a6, .LBB30_11
+; CHECK-NOV-NEXT:    bge a1, a6, .LBB30_11
 ; CHECK-NOV-NEXT:  .LBB30_2: # %entry
 ; CHECK-NOV-NEXT:    fcvt.l.s a5, fa0, rtz
 ; CHECK-NOV-NEXT:    bge a4, a6, .LBB30_12
@@ -3710,23 +3710,23 @@ define <4 x i32> @stest_f32i32_mm(<4 x float> %x) {
 ; CHECK-NOV-NEXT:  .LBB30_5: # %entry
 ; CHECK-NOV-NEXT:    bge a3, a4, .LBB30_15
 ; CHECK-NOV-NEXT:  .LBB30_6: # %entry
-; CHECK-NOV-NEXT:    bge a3, a2, .LBB30_16
+; CHECK-NOV-NEXT:    bge a3, a1, .LBB30_16
 ; CHECK-NOV-NEXT:  .LBB30_7: # %entry
-; CHECK-NOV-NEXT:    blt a3, a1, .LBB30_9
+; CHECK-NOV-NEXT:    blt a3, a2, .LBB30_9
 ; CHECK-NOV-NEXT:  .LBB30_8: # %entry
-; CHECK-NOV-NEXT:    lui a1, 524288
+; CHECK-NOV-NEXT:    lui a2, 524288
 ; CHECK-NOV-NEXT:  .LBB30_9: # %entry
 ; CHECK-NOV-NEXT:    sw a5, 0(a0)
 ; CHECK-NOV-NEXT:    sw a4, 4(a0)
-; CHECK-NOV-NEXT:    sw a2, 8(a0)
-; CHECK-NOV-NEXT:    sw a1, 12(a0)
+; CHECK-NOV-NEXT:    sw a1, 8(a0)
+; CHECK-NOV-NEXT:    sw a2, 12(a0)
 ; CHECK-NOV-NEXT:    ret
 ; CHECK-NOV-NEXT:  .LBB30_10: # %entry
-; CHECK-NOV-NEXT:    mv a1, a6
+; CHECK-NOV-NEXT:    mv a2, a6
 ; CHECK-NOV-NEXT:    fcvt.l.s a4, fa1, rtz
-; CHECK-NOV-NEXT:    blt a2, a6, .LBB30_2
+; CHECK-NOV-NEXT:    blt a1, a6, .LBB30_2
 ; CHECK-NOV-NEXT:  .LBB30_11: # %entry
-; CHECK-NOV-NEXT:    mv a2, a6
+; CHECK-NOV-NEXT:    mv a1, a6
 ; CHECK-NOV-NEXT:    fcvt.l.s a5, fa0, rtz
 ; CHECK-NOV-NEXT:    blt a4, a6, .LBB30_3
 ; CHECK-NOV-NEXT:  .LBB30_12: # %entry
@@ -3740,10 +3740,10 @@ define <4 x i32> @stest_f32i32_mm(<4 x float> %x) {
 ; CHECK-NOV-NEXT:    blt a3, a4, .LBB30_6
 ; CHECK-NOV-NEXT:  .LBB30_15: # %entry
 ; CHECK-NOV-NEXT:    lui a4, 524288
-; CHECK-NOV-NEXT:    blt a3, a2, .LBB30_7
+; CHECK-NOV-NEXT:    blt a3, a1, .LBB30_7
 ; CHECK-NOV-NEXT:  .LBB30_16: # %entry
-; CHECK-NOV-NEXT:    lui a2, 524288
-; CHECK-NOV-NEXT:    bge a3, a1, .LBB30_8
+; CHECK-NOV-NEXT:    lui a1, 524288
+; CHECK-NOV-NEXT:    bge a3, a2, .LBB30_8
 ; CHECK-NOV-NEXT:    j .LBB30_9
 ;
 ; CHECK-V-LABEL: stest_f32i32_mm:
@@ -3763,14 +3763,14 @@ entry:
 define <4 x i32> @utest_f32i32_mm(<4 x float> %x) {
 ; CHECK-NOV-LABEL: utest_f32i32_mm:
 ; CHECK-NOV:       # %bb.0: # %entry
-; CHECK-NOV-NEXT:    fcvt.lu.s a1, fa0, rtz
+; CHECK-NOV-NEXT:    fcvt.lu.s a1, fa1, rtz
+; CHECK-NOV-NEXT:    fcvt.lu.s a2, fa0, rtz
 ; CHECK-NOV-NEXT:    li a3, -1
 ; CHECK-NOV-NEXT:    srli a3, a3, 32
-; CHECK-NOV-NEXT:    fcvt.lu.s a2, fa1, rtz
-; CHECK-NOV-NEXT:    bgeu a1, a3, .LBB31_6
+; CHECK-NOV-NEXT:    bgeu a2, a3, .LBB31_6
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    fcvt.lu.s a4, fa2, rtz
-; CHECK-NOV-NEXT:    bgeu a2, a3, .LBB31_7
+; CHECK-NOV-NEXT:    bgeu a1, a3, .LBB31_7
 ; CHECK-NOV-NEXT:  .LBB31_2: # %entry
 ; CHECK-NOV-NEXT:    fcvt.lu.s a5, fa3, rtz
 ; CHECK-NOV-NEXT:    bgeu a4, a3, .LBB31_8
@@ -3779,17 +3779,17 @@ define <4 x i32> @utest_f32i32_mm(<4 x float> %x) {
 ; CHECK-NOV-NEXT:  .LBB31_4: # %entry
 ; CHECK-NOV-NEXT:    mv a5, a3
 ; CHECK-NOV-NEXT:  .LBB31_5: # %entry
-; CHECK-NOV-NEXT:    sw a1, 0(a0)
-; CHECK-NOV-NEXT:    sw a2, 4(a0)
+; CHECK-NOV-NEXT:    sw a2, 0(a0)
+; CHECK-NOV-NEXT:    sw a1, 4(a0)
 ; CHECK-NOV-NEXT:    sw a4, 8(a0)
 ; CHECK-NOV-NEXT:    sw a5, 12(a0)
 ; CHECK-NOV-NEXT:    ret
 ; CHECK-NOV-NEXT:  .LBB31_6: # %entry
-; CHECK-NOV-NEXT:    mv a1, a3
+; CHECK-NOV-NEXT:    mv a2, a3
 ; CHECK-NOV-NEXT:    fcvt.lu.s a4, fa2, rtz
-; CHECK-NOV-NEXT:    bltu a2, a3, .LBB31_2
+; CHECK-NOV-NEXT:    bltu a1, a3, .LBB31_2
 ; CHECK-NOV-NEXT:  .LBB31_7: # %entry
-; CHECK-NOV-NEXT:    mv a2, a3
+; CHECK-NOV-NEXT:    mv a1, a3
 ; CHECK-NOV-NEXT:    fcvt.lu.s a5, fa3, rtz
 ; CHECK-NOV-NEXT:    bltu a4, a3, .LBB31_3
 ; CHECK-NOV-NEXT:  .LBB31_8: # %entry
@@ -3813,50 +3813,50 @@ entry:
 define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) {
 ; CHECK-NOV-LABEL: ustest_f32i32_mm:
 ; CHECK-NOV:       # %bb.0: # %entry
-; CHECK-NOV-NEXT:    fcvt.l.s a1, fa3, rtz
-; CHECK-NOV-NEXT:    li a3, -1
-; CHECK-NOV-NEXT:    srli a3, a3, 32
 ; CHECK-NOV-NEXT:    fcvt.l.s a2, fa2, rtz
-; CHECK-NOV-NEXT:    bge a1, a3, .LBB32_6
+; CHECK-NOV-NEXT:    fcvt.l.s a1, fa3, rtz
+; CHECK-NOV-NEXT:    li a4, -1
+; CHECK-NOV-NEXT:    srli a4, a4, 32
+; CHECK-NOV-NEXT:    bge a1, a4, .LBB32_6
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
-; CHECK-NOV-NEXT:    fcvt.l.s a4, fa1, rtz
-; CHECK-NOV-NEXT:    bge a2, a3, .LBB32_7
+; CHECK-NOV-NEXT:    fcvt.l.s a3, fa1, rtz
+; CHECK-NOV-NEXT:    bge a2, a4, .LBB32_7
 ; CHECK-NOV-NEXT:  .LBB32_2: # %entry
 ; CHECK-NOV-NEXT:    fcvt.l.s a5, fa0, rtz
-; CHECK-NOV-NEXT:    bge a4, a3, .LBB32_8
+; CHECK-NOV-NEXT:    bge a3, a4, .LBB32_8
 ; CHECK-NOV-NEXT:  .LBB32_3: # %entry
-; CHECK-NOV-NEXT:    blt a5, a3, .LBB32_5
+; CHECK-NOV-NEXT:    blt a5, a4, .LBB32_5
 ; CHECK-NOV-NEXT:  .LBB32_4: # %entry
-; CHECK-NOV-NEXT:    mv a5, a3
+; CHECK-NOV-NEXT:    mv a5, a4
 ; CHECK-NOV-NEXT:  .LBB32_5: # %entry
-; CHECK-NOV-NEXT:    sgtz a3, a5
-; CHECK-NOV-NEXT:    negw a3, a3
-; CHECK-NOV-NEXT:    and a3, a3, a5
-; CHECK-NOV-NEXT:    sgtz a5, a4
+; CHECK-NOV-NEXT:    sgtz a4, a5
+; CHECK-NOV-NEXT:    negw a4, a4
+; CHECK-NOV-NEXT:    and a4, a4, a5
+; CHECK-NOV-NEXT:    sgtz a5, a3
 ; CHECK-NOV-NEXT:    negw a5, a5
-; CHECK-NOV-NEXT:    and a4, a5, a4
+; CHECK-NOV-NEXT:    and a3, a5, a3
 ; CHECK-NOV-NEXT:    sgtz a5, a2
 ; CHECK-NOV-NEXT:    negw a5, a5
 ; CHECK-NOV-NEXT:    and a2, a5, a2
 ; CHECK-NOV-NEXT:    sgtz a5, a1
 ; CHECK-NOV-NEXT:    negw a5, a5
 ; CHECK-NOV-NEXT:    and a1, a5, a1
-; CHECK-NOV-NEXT:    sw a3, 0(a0)
-; CHECK-NOV-NEXT:    sw a4, 4(a0)
+; CHECK-NOV-NEXT:    sw a4, 0(a0)
+; CHECK-NOV-NEXT:    sw a3, 4(a0)
 ; CHECK-NOV-NEXT:    sw a2, 8(a0)
 ; CHECK-NOV-NEXT:    sw a1, 12(a0)
 ; CHECK-NOV-NEXT:    ret
 ; CHECK-NOV-NEXT:  .LBB32_6: # %entry
-; CHECK-NOV-NEXT:    mv a1, a3
-; CHECK-NOV-NEXT:    fcvt.l.s a4, fa1, rtz
-; CHECK-NOV-NEXT:    blt a2, a3, .LBB32_2
+; CHECK-NOV-NEXT:    mv a1, a4
+; CHECK-NOV-NEXT:    fcvt.l.s a3, fa1, rtz
+; CHECK-NOV-NEXT:    blt a2, a4, .LBB32_2
 ; CHECK-NOV-NEXT:  .LBB32_7: # %entry
-; CHECK-NOV-NEXT:    mv a2, a3
+; CHECK-NOV-NEXT:    mv a2, a4
 ; CHECK-NOV-NEXT:    fcvt.l.s a5, fa0, rtz
-; CHECK-NOV-NEXT:    blt a4, a3, .LBB32_3
+; CHECK-NOV-NEXT:    blt a3, a4, .LBB32_3
 ; CHECK-NOV-NEXT:  .LBB32_8: # %entry
-; CHECK-NOV-NEXT:    mv a4, a3
-; CHECK-NOV-NEXT:    bge a5, a3, .LBB32_4
+; CHECK-NOV-NEXT:    mv a3, a4
+; CHECK-NOV-NEXT:    bge a5, a4, .LBB32_4
 ; CHECK-NOV-NEXT:    j .LBB32_5
 ;
 ; CHECK-V-LABEL: ustest_f32i32_mm:
@@ -3898,12 +3898,12 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
 ; CHECK-NOV-NEXT:    .cfi_remember_state
+; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
 ; CHECK-NOV-NEXT:    lhu s2, 8(a1)
-; CHECK-NOV-NEXT:    lhu a2, 16(a1)
+; CHECK-NOV-NEXT:    lhu a0, 16(a1)
 ; CHECK-NOV-NEXT:    lhu s3, 24(a1)
-; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
@@ -3912,8 +3912,8 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs2, rtz
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-NOV-NEXT:    lui a1, 524288
@@ -4011,11 +4011,11 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
@@ -4030,11 +4030,11 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
@@ -4096,22 +4096,22 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
 ; CHECK-NOV-NEXT:    .cfi_remember_state
-; CHECK-NOV-NEXT:    lhu s1, 0(a1)
-; CHECK-NOV-NEXT:    lhu a2, 8(a1)
-; CHECK-NOV-NEXT:    lhu s2, 16(a1)
-; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
+; CHECK-NOV-NEXT:    lhu s2, 0(a1)
+; CHECK-NOV-NEXT:    lhu a0, 8(a1)
+; CHECK-NOV-NEXT:    lhu s1, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    fcvt.lu.s s1, fs2, rtz
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-NOV-NEXT:    li a1, -1
@@ -4189,11 +4189,11 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
@@ -4208,11 +4208,11 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
@@ -4273,12 +4273,12 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
 ; CHECK-NOV-NEXT:    .cfi_remember_state
+; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
 ; CHECK-NOV-NEXT:    lhu s2, 8(a1)
-; CHECK-NOV-NEXT:    lhu a2, 16(a1)
+; CHECK-NOV-NEXT:    lhu a0, 16(a1)
 ; CHECK-NOV-NEXT:    lhu s3, 24(a1)
-; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
@@ -4287,8 +4287,8 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs2, rtz
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-NOV-NEXT:    li a2, -1
@@ -4378,11 +4378,11 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
@@ -4397,11 +4397,11 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
@@ -4447,10 +4447,10 @@ entry:
 define <2 x i16> @stest_f64i16_mm(<2 x double> %x) {
 ; CHECK-NOV-LABEL: stest_f64i16_mm:
 ; CHECK-NOV:       # %bb.0: # %entry
+; CHECK-NOV-NEXT:    fcvt.w.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    fcvt.w.d a1, fa1, rtz
 ; CHECK-NOV-NEXT:    lui a2, 8
 ; CHECK-NOV-NEXT:    addiw a2, a2, -1
-; CHECK-NOV-NEXT:    fcvt.w.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    bge a1, a2, .LBB36_5
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    bge a0, a2, .LBB36_6
@@ -4493,10 +4493,10 @@ entry:
 define <2 x i16> @utest_f64i16_mm(<2 x double> %x) {
 ; CHECK-NOV-LABEL: utest_f64i16_mm:
 ; CHECK-NOV:       # %bb.0: # %entry
+; CHECK-NOV-NEXT:    fcvt.wu.d a1, fa1, rtz
 ; CHECK-NOV-NEXT:    fcvt.wu.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    lui a2, 16
 ; CHECK-NOV-NEXT:    addiw a2, a2, -1
-; CHECK-NOV-NEXT:    fcvt.wu.d a1, fa1, rtz
 ; CHECK-NOV-NEXT:    bgeu a0, a2, .LBB37_3
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    bgeu a1, a2, .LBB37_4
@@ -4526,10 +4526,10 @@ entry:
 define <2 x i16> @ustest_f64i16_mm(<2 x double> %x) {
 ; CHECK-NOV-LABEL: ustest_f64i16_mm:
 ; CHECK-NOV:       # %bb.0: # %entry
+; CHECK-NOV-NEXT:    fcvt.w.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    fcvt.w.d a1, fa1, rtz
 ; CHECK-NOV-NEXT:    lui a2, 16
 ; CHECK-NOV-NEXT:    addiw a2, a2, -1
-; CHECK-NOV-NEXT:    fcvt.w.d a0, fa0, rtz
 ; CHECK-NOV-NEXT:    blt a1, a2, .LBB38_2
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    mv a1, a2
@@ -4565,14 +4565,14 @@ entry:
 define <4 x i16> @stest_f32i16_mm(<4 x float> %x) {
 ; CHECK-NOV-LABEL: stest_f32i16_mm:
 ; CHECK-NOV:       # %bb.0: # %entry
-; CHECK-NOV-NEXT:    fcvt.w.s a1, fa3, rtz
+; CHECK-NOV-NEXT:    fcvt.w.s a1, fa2, rtz
+; CHECK-NOV-NEXT:    fcvt.w.s a2, fa3, rtz
 ; CHECK-NOV-NEXT:    lui a5, 8
 ; CHECK-NOV-NEXT:    addiw a5, a5, -1
-; CHECK-NOV-NEXT:    fcvt.w.s a2, fa2, rtz
-; CHECK-NOV-NEXT:    bge a1, a5, .LBB39_10
+; CHECK-NOV-NEXT:    bge a2, a5, .LBB39_10
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    fcvt.w.s a3, fa1, rtz
-; CHECK-NOV-NEXT:    bge a2, a5, .LBB39_11
+; CHECK-NOV-NEXT:    bge a1, a5, .LBB39_11
 ; CHECK-NOV-NEXT:  .LBB39_2: # %entry
 ; CHECK-NOV-NEXT:    fcvt.w.s a4, fa0, rtz
 ; CHECK-NOV-NEXT:    bge a3, a5, .LBB39_12
@@ -4584,23 +4584,23 @@ define <4 x i16> @stest_f32i16_mm(<4 x float> %x) {
 ; CHECK-NOV-NEXT:  .LBB39_5: # %entry
 ; CHECK-NOV-NEXT:    bge a5, a3, .LBB39_15
 ; CHECK-NOV-NEXT:  .LBB39_6: # %entry
-; CHECK-NOV-NEXT:    bge a5, a2, .LBB39_16
+; CHECK-NOV-NEXT:    bge a5, a1, .LBB39_16
 ; CHECK-NOV-NEXT:  .LBB39_7: # %entry
-; CHECK-NOV-NEXT:    blt a5, a1, .LBB39_9
+; CHECK-NOV-NEXT:    blt a5, a2, .LBB39_9
 ; CHECK-NOV-NEXT:  .LBB39_8: # %entry
-; CHECK-NOV-NEXT:    lui a1, 1048568
+; CHECK-NOV-NEXT:    lui a2, 1048568
 ; CHECK-NOV-NEXT:  .LBB39_9: # %entry
 ; CHECK-NOV-NEXT:    sh a4, 0(a0)
 ; CHECK-NOV-NEXT:    sh a3, 2(a0)
-; CHECK-NOV-NEXT:    sh a2, 4(a0)
-; CHECK-NOV-NEXT:    sh a1, 6(a0)
+; CHECK-NOV-NEXT:    sh a1, 4(a0)
+; CHECK-NOV-NEXT:    sh a2, 6(a0)
 ; CHECK-NOV-NEXT:    ret
 ; CHECK-NOV-NEXT:  .LBB39_10: # %entry
-; CHECK-NOV-NEXT:    mv a1, a5
+; CHECK-NOV-NEXT:    mv a2, a5
 ; CHECK-NOV-NEXT:    fcvt.w.s a3, fa1, rtz
-; CHECK-NOV-NEXT:    blt a2, a5, .LBB39_2
+; CHECK-NOV-NEXT:    blt a1, a5, .LBB39_2
 ; CHECK-NOV-NEXT:  .LBB39_11: # %entry
-; CHECK-NOV-NEXT:    mv a2, a5
+; CHECK-NOV-NEXT:    mv a1, a5
 ; CHECK-NOV-NEXT:    fcvt.w.s a4, fa0, rtz
 ; CHECK-NOV-NEXT:    blt a3, a5, .LBB39_3
 ; CHECK-NOV-NEXT:  .LBB39_12: # %entry
@@ -4615,10 +4615,10 @@ define <4 x i16> @stest_f32i16_mm(<4 x float> %x) {
 ; CHECK-NOV-NEXT:    blt a5, a3, .LBB39_6
 ; CHECK-NOV-NEXT:  .LBB39_15: # %entry
 ; CHECK-NOV-NEXT:    lui a3, 1048568
-; CHECK-NOV-NEXT:    blt a5, a2, .LBB39_7
+; CHECK-NOV-NEXT:    blt a5, a1, .LBB39_7
 ; CHECK-NOV-NEXT:  .LBB39_16: # %entry
-; CHECK-NOV-NEXT:    lui a2, 1048568
-; CHECK-NOV-NEXT:    bge a5, a1, .LBB39_8
+; CHECK-NOV-NEXT:    lui a1, 1048568
+; CHECK-NOV-NEXT:    bge a5, a2, .LBB39_8
 ; CHECK-NOV-NEXT:    j .LBB39_9
 ;
 ; CHECK-V-LABEL: stest_f32i16_mm:
@@ -4639,14 +4639,14 @@ entry:
 define <4 x i16> @utest_f32i16_mm(<4 x float> %x) {
 ; CHECK-NOV-LABEL: utest_f32i16_mm:
 ; CHECK-NOV:       # %bb.0: # %entry
-; CHECK-NOV-NEXT:    fcvt.wu.s a1, fa0, rtz
+; CHECK-NOV-NEXT:    fcvt.wu.s a1, fa1, rtz
+; CHECK-NOV-NEXT:    fcvt.wu.s a2, fa0, rtz
 ; CHECK-NOV-NEXT:    lui a3, 16
 ; CHECK-NOV-NEXT:    addiw a3, a3, -1
-; CHECK-NOV-NEXT:    fcvt.wu.s a2, fa1, rtz
-; CHECK-NOV-NEXT:    bgeu a1, a3, .LBB40_6
+; CHECK-NOV-NEXT:    bgeu a2, a3, .LBB40_6
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    fcvt.wu.s a4, fa2, rtz
-; CHECK-NOV-NEXT:    bgeu a2, a3, .LBB40_7
+; CHECK-NOV-NEXT:    bgeu a1, a3, .LBB40_7
 ; CHECK-NOV-NEXT:  .LBB40_2: # %entry
 ; CHECK-NOV-NEXT:    fcvt.wu.s a5, fa3, rtz
 ; CHECK-NOV-NEXT:    bgeu a4, a3, .LBB40_8
@@ -4655,17 +4655,17 @@ define <4 x i16> @utest_f32i16_mm(<4 x float> %x) {
 ; CHECK-NOV-NEXT:  .LBB40_4: # %entry
 ; CHECK-NOV-NEXT:    mv a5, a3
 ; CHECK-NOV-NEXT:  .LBB40_5: # %entry
-; CHECK-NOV-NEXT:    sh a1, 0(a0)
-; CHECK-NOV-NEXT:    sh a2, 2(a0)
+; CHECK-NOV-NEXT:    sh a2, 0(a0)
+; CHECK-NOV-NEXT:    sh a1, 2(a0)
 ; CHECK-NOV-NEXT:    sh a4, 4(a0)
 ; CHECK-NOV-NEXT:    sh a5, 6(a0)
 ; CHECK-NOV-NEXT:    ret
 ; CHECK-NOV-NEXT:  .LBB40_6: # %entry
-; CHECK-NOV-NEXT:    mv a1, a3
+; CHECK-NOV-NEXT:    mv a2, a3
 ; CHECK-NOV-NEXT:    fcvt.wu.s a4, fa2, rtz
-; CHECK-NOV-NEXT:    bltu a2, a3, .LBB40_2
+; CHECK-NOV-NEXT:    bltu a1, a3, .LBB40_2
 ; CHECK-NOV-NEXT:  .LBB40_7: # %entry
-; CHECK-NOV-NEXT:    mv a2, a3
+; CHECK-NOV-NEXT:    mv a1, a3
 ; CHECK-NOV-NEXT:    fcvt.wu.s a5, fa3, rtz
 ; CHECK-NOV-NEXT:    bltu a4, a3, .LBB40_3
 ; CHECK-NOV-NEXT:  .LBB40_8: # %entry
@@ -4690,50 +4690,50 @@ entry:
 define <4 x i16> @ustest_f32i16_mm(<4 x float> %x) {
 ; CHECK-NOV-LABEL: ustest_f32i16_mm:
 ; CHECK-NOV:       # %bb.0: # %entry
-; CHECK-NOV-NEXT:    fcvt.w.s a1, fa3, rtz
-; CHECK-NOV-NEXT:    lui a3, 16
-; CHECK-NOV-NEXT:    addiw a3, a3, -1
 ; CHECK-NOV-NEXT:    fcvt.w.s a2, fa2, rtz
-; CHECK-NOV-NEXT:    bge a1, a3, .LBB41_6
+; CHECK-NOV-NEXT:    fcvt.w.s a1, fa3, rtz
+; CHECK-NOV-NEXT:    lui a4, 16
+; CHECK-NOV-NEXT:    addiw a4, a4, -1
+; CHECK-NOV-NEXT:    bge a1, a4, .LBB41_6
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
-; CHECK-NOV-NEXT:    fcvt.w.s a4, fa1, rtz
-; CHECK-NOV-NEXT:    bge a2, a3, .LBB41_7
+; CHECK-NOV-NEXT:    fcvt.w.s a3, fa1, rtz
+; CHECK-NOV-NEXT:    bge a2, a4, .LBB41_7
 ; CHECK-NOV-NEXT:  .LBB41_2: # %entry
 ; CHECK-NOV-NEXT:    fcvt.w.s a5, fa0, rtz
-; CHECK-NOV-NEXT:    bge a4, a3, .LBB41_8
+; CHECK-NOV-NEXT:    bge a3, a4, .LBB41_8
 ; CHECK-NOV-NEXT:  .LBB41_3: # %entry
-; CHECK-NOV-NEXT:    blt a5, a3, .LBB41_5
+; CHECK-NOV-NEXT:    blt a5, a4, .LBB41_5
 ; CHECK-NOV-NEXT:  .LBB41_4: # %entry
-; CHECK-NOV-NEXT:    mv a5, a3
+; CHECK-NOV-NEXT:    mv a5, a4
 ; CHECK-NOV-NEXT:  .LBB41_5: # %entry
-; CHECK-NOV-NEXT:    sgtz a3, a5
-; CHECK-NOV-NEXT:    negw a3, a3
-; CHECK-NOV-NEXT:    and a3, a3, a5
-; CHECK-NOV-NEXT:    sgtz a5, a4
+; CHECK-NOV-NEXT:    sgtz a4, a5
+; CHECK-NOV-NEXT:    negw a4, a4
+; CHECK-NOV-NEXT:    and a4, a4, a5
+; CHECK-NOV-NEXT:    sgtz a5, a3
 ; CHECK-NOV-NEXT:    negw a5, a5
-; CHECK-NOV-NEXT:    and a4, a5, a4
+; CHECK-NOV-NEXT:    and a3, a5, a3
 ; CHECK-NOV-NEXT:    sgtz a5, a2
 ; CHECK-NOV-NEXT:    negw a5, a5
 ; CHECK-NOV-NEXT:    and a2, a5, a2
 ; CHECK-NOV-NEXT:    sgtz a5, a1
 ; CHECK-NOV-NEXT:    negw a5, a5
 ; CHECK-NOV-NEXT:    and a1, a5, a1
-; CHECK-NOV-NEXT:    sh a3, 0(a0)
-; CHECK-NOV-NEXT:    sh a4, 2(a0)
+; CHECK-NOV-NEXT:    sh a4, 0(a0)
+; CHECK-NOV-NEXT:    sh a3, 2(a0)
 ; CHECK-NOV-NEXT:    sh a2, 4(a0)
 ; CHECK-NOV-NEXT:    sh a1, 6(a0)
 ; CHECK-NOV-NEXT:    ret
 ; CHECK-NOV-NEXT:  .LBB41_6: # %entry
-; CHECK-NOV-NEXT:    mv a1, a3
-; CHECK-NOV-NEXT:    fcvt.w.s a4, fa1, rtz
-; CHECK-NOV-NEXT:    blt a2, a3, .LBB41_2
+; CHECK-NOV-NEXT:    mv a1, a4
+; CHECK-NOV-NEXT:    fcvt.w.s a3, fa1, rtz
+; CHECK-NOV-NEXT:    blt a2, a4, .LBB41_2
 ; CHECK-NOV-NEXT:  .LBB41_7: # %entry
-; CHECK-NOV-NEXT:    mv a2, a3
+; CHECK-NOV-NEXT:    mv a2, a4
 ; CHECK-NOV-NEXT:    fcvt.w.s a5, fa0, rtz
-; CHECK-NOV-NEXT:    blt a4, a3, .LBB41_3
+; CHECK-NOV-NEXT:    blt a3, a4, .LBB41_3
 ; CHECK-NOV-NEXT:  .LBB41_8: # %entry
-; CHECK-NOV-NEXT:    mv a4, a3
-; CHECK-NOV-NEXT:    bge a5, a3, .LBB41_4
+; CHECK-NOV-NEXT:    mv a3, a4
+; CHECK-NOV-NEXT:    bge a5, a4, .LBB41_4
 ; CHECK-NOV-NEXT:    j .LBB41_5
 ;
 ; CHECK-V-LABEL: ustest_f32i16_mm:
@@ -4790,16 +4790,16 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
 ; CHECK-NOV-NEXT:    .cfi_remember_state
+; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    lhu s1, 32(a1)
 ; CHECK-NOV-NEXT:    lhu s2, 40(a1)
-; CHECK-NOV-NEXT:    lhu a2, 48(a1)
+; CHECK-NOV-NEXT:    lhu a0, 48(a1)
 ; CHECK-NOV-NEXT:    lhu s3, 56(a1)
 ; CHECK-NOV-NEXT:    lhu s4, 0(a1)
 ; CHECK-NOV-NEXT:    lhu s5, 8(a1)
 ; CHECK-NOV-NEXT:    lhu s6, 16(a1)
 ; CHECK-NOV-NEXT:    lhu s7, 24(a1)
-; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
@@ -4820,8 +4820,8 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs6, rtz
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-NOV-NEXT:    lui a7, 8
@@ -5000,7 +5000,6 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -5008,6 +5007,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -5025,7 +5025,6 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -5033,6 +5032,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -5057,11 +5057,11 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -5076,11 +5076,11 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -5172,16 +5172,16 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
 ; CHECK-NOV-NEXT:    .cfi_remember_state
+; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    lhu s1, 32(a1)
 ; CHECK-NOV-NEXT:    lhu s2, 40(a1)
 ; CHECK-NOV-NEXT:    lhu s3, 48(a1)
 ; CHECK-NOV-NEXT:    lhu s4, 56(a1)
 ; CHECK-NOV-NEXT:    lhu s5, 0(a1)
-; CHECK-NOV-NEXT:    lhu a2, 8(a1)
+; CHECK-NOV-NEXT:    lhu a0, 8(a1)
 ; CHECK-NOV-NEXT:    lhu s6, 16(a1)
 ; CHECK-NOV-NEXT:    lhu s7, 24(a1)
-; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
@@ -5202,8 +5202,8 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    fcvt.lu.s s1, fs6, rtz
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-NOV-NEXT:    lui a3, 16
@@ -5340,7 +5340,6 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -5348,6 +5347,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -5365,7 +5365,6 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -5373,6 +5372,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -5397,11 +5397,11 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -5416,11 +5416,11 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -5511,16 +5511,16 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
 ; CHECK-NOV-NEXT:    .cfi_remember_state
+; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    lhu s1, 32(a1)
 ; CHECK-NOV-NEXT:    lhu s2, 40(a1)
-; CHECK-NOV-NEXT:    lhu a2, 48(a1)
+; CHECK-NOV-NEXT:    lhu a0, 48(a1)
 ; CHECK-NOV-NEXT:    lhu s3, 56(a1)
 ; CHECK-NOV-NEXT:    lhu s4, 0(a1)
 ; CHECK-NOV-NEXT:    lhu s5, 8(a1)
 ; CHECK-NOV-NEXT:    lhu s6, 16(a1)
 ; CHECK-NOV-NEXT:    lhu s7, 24(a1)
-; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
@@ -5541,8 +5541,8 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs6, rtz
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-NOV-NEXT:    lui a3, 16
@@ -5703,7 +5703,6 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -5711,6 +5710,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -5728,7 +5728,6 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -5736,6 +5735,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -5760,11 +5760,11 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
@@ -5779,11 +5779,11 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll b/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll
index 195ffc50594c3..9e8cd85739183 100644
--- a/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll
@@ -466,8 +466,8 @@ define <vscale x 1 x float> @test5(<vscale x 1 x float> %0, <vscale x 1 x float>
 ; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    srl a0, a0, a2
 ; CHECK-NEXT:    andi a0, a0, 7
-; CHECK-NEXT:    vfadd.vv v8, v8, v8
 ; CHECK-NEXT:    sw a0, 0(a1)
+; CHECK-NEXT:    vfadd.vv v8, v8, v8
 ; CHECK-NEXT:    ret
 ;
 ; UNOPT-LABEL: test5:
@@ -482,8 +482,8 @@ define <vscale x 1 x float> @test5(<vscale x 1 x float> %0, <vscale x 1 x float>
 ; UNOPT-NEXT:    slli a2, a2, 2
 ; UNOPT-NEXT:    srl a0, a0, a2
 ; UNOPT-NEXT:    andi a0, a0, 7
-; UNOPT-NEXT:    vfadd.vv v8, v8, v8
 ; UNOPT-NEXT:    sw a0, 0(a1)
+; UNOPT-NEXT:    vfadd.vv v8, v8, v8
 ; UNOPT-NEXT:    ret
 entry:
   %a = call <vscale x 1 x float> @llvm.riscv.vfadd.nxv1f32.nxv1f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll
index 3d992aa13e379..15ba3850de23d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll
@@ -12,11 +12,11 @@ define <vscale x 1 x half> @round_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -35,11 +35,11 @@ define <vscale x 2 x half> @round_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -58,11 +58,11 @@ define <vscale x 4 x half> @round_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -81,11 +81,11 @@ define <vscale x 8 x half> @round_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -104,11 +104,11 @@ define <vscale x 16 x half> @round_nxv16f16(<vscale x 16 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -127,11 +127,11 @@ define <vscale x 32 x half> @round_nxv32f16(<vscale x 32 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -152,9 +152,9 @@ define <vscale x 1 x float> @round_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -175,9 +175,9 @@ define <vscale x 2 x float> @round_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -198,9 +198,9 @@ define <vscale x 4 x float> @round_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -221,9 +221,9 @@ define <vscale x 8 x float> @round_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -244,9 +244,9 @@ define <vscale x 16 x float> @round_nxv16f32(<vscale x 16 x float> %x) strictfp
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -265,11 +265,11 @@ define <vscale x 1 x double> @round_nxv1f64(<vscale x 1 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -288,11 +288,11 @@ define <vscale x 2 x double> @round_nxv2f64(<vscale x 2 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -311,11 +311,11 @@ define <vscale x 4 x double> @round_nxv4f64(<vscale x 4 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -334,11 +334,11 @@ define <vscale x 8 x double> @round_nxv8f64(<vscale x 8 x double> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll
index f7422b279149f..323a22a89bf7b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll
@@ -20,11 +20,11 @@ define <vscale x 1 x bfloat> @round_nxv1bf16(<vscale x 1 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -43,11 +43,11 @@ define <vscale x 2 x bfloat> @round_nxv2bf16(<vscale x 2 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -66,11 +66,11 @@ define <vscale x 4 x bfloat> @round_nxv4bf16(<vscale x 4 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -89,11 +89,11 @@ define <vscale x 8 x bfloat> @round_nxv8bf16(<vscale x 8 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -112,11 +112,11 @@ define <vscale x 16 x bfloat> @round_nxv16bf16(<vscale x 16 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -135,11 +135,11 @@ define <vscale x 32 x bfloat> @round_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -169,12 +169,12 @@ define <vscale x 32 x bfloat> @round_nxv32bf16(<vscale x 32 x bfloat> %x) {
 define <vscale x 1 x half> @round_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFH-LABEL: round_nxv1f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI6_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI6_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -187,11 +187,11 @@ define <vscale x 1 x half> @round_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -208,12 +208,12 @@ declare <vscale x 1 x half> @llvm.round.nxv1f16(<vscale x 1 x half>)
 define <vscale x 2 x half> @round_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFH-LABEL: round_nxv2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI7_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -226,11 +226,11 @@ define <vscale x 2 x half> @round_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -247,12 +247,12 @@ declare <vscale x 2 x half> @llvm.round.nxv2f16(<vscale x 2 x half>)
 define <vscale x 4 x half> @round_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFH-LABEL: round_nxv4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI8_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI8_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI8_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI8_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -265,11 +265,11 @@ define <vscale x 4 x half> @round_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -286,12 +286,12 @@ declare <vscale x 4 x half> @llvm.round.nxv4f16(<vscale x 4 x half>)
 define <vscale x 8 x half> @round_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFH-LABEL: round_nxv8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI9_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI9_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI9_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI9_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -304,11 +304,11 @@ define <vscale x 8 x half> @round_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -325,12 +325,12 @@ declare <vscale x 8 x half> @llvm.round.nxv8f16(<vscale x 8 x half>)
 define <vscale x 16 x half> @round_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFH-LABEL: round_nxv16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI10_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI10_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8
-; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI10_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI10_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -343,11 +343,11 @@ define <vscale x 16 x half> @round_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -364,12 +364,12 @@ declare <vscale x 16 x half> @llvm.round.nxv16f16(<vscale x 16 x half>)
 define <vscale x 32 x half> @round_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFH-LABEL: round_nxv32f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI11_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI11_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8
-; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI11_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI11_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -382,11 +382,11 @@ define <vscale x 32 x half> @round_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -421,8 +421,8 @@ define <vscale x 1 x float> @round_nxv1f32(<vscale x 1 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -441,8 +441,8 @@ define <vscale x 2 x float> @round_nxv2f32(<vscale x 2 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -461,8 +461,8 @@ define <vscale x 4 x float> @round_nxv4f32(<vscale x 4 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -481,8 +481,8 @@ define <vscale x 8 x float> @round_nxv8f32(<vscale x 8 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -501,8 +501,8 @@ define <vscale x 16 x float> @round_nxv16f32(<vscale x 16 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -517,12 +517,12 @@ declare <vscale x 16 x float> @llvm.round.nxv16f32(<vscale x 16 x float>)
 define <vscale x 1 x double> @round_nxv1f64(<vscale x 1 x double> %x) {
 ; CHECK-LABEL: round_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -537,12 +537,12 @@ declare <vscale x 1 x double> @llvm.round.nxv1f64(<vscale x 1 x double>)
 define <vscale x 2 x double> @round_nxv2f64(<vscale x 2 x double> %x) {
 ; CHECK-LABEL: round_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -557,12 +557,12 @@ declare <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double>)
 define <vscale x 4 x double> @round_nxv4f64(<vscale x 4 x double> %x) {
 ; CHECK-LABEL: round_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -577,12 +577,12 @@ declare <vscale x 4 x double> @llvm.round.nxv4f64(<vscale x 4 x double>)
 define <vscale x 8 x double> @round_nxv8f64(<vscale x 8 x double> %x) {
 ; CHECK-LABEL: round_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI20_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI20_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll
index c293ac91b63bf..6cd6eef99a9ec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll
@@ -12,11 +12,11 @@ define <vscale x 1 x half> @roundeven_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -35,11 +35,11 @@ define <vscale x 2 x half> @roundeven_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -58,11 +58,11 @@ define <vscale x 4 x half> @roundeven_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -81,11 +81,11 @@ define <vscale x 8 x half> @roundeven_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -104,11 +104,11 @@ define <vscale x 16 x half> @roundeven_nxv16f16(<vscale x 16 x half> %x) strictf
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -127,11 +127,11 @@ define <vscale x 32 x half> @roundeven_nxv32f16(<vscale x 32 x half> %x) strictf
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -152,9 +152,9 @@ define <vscale x 1 x float> @roundeven_nxv1f32(<vscale x 1 x float> %x) strictfp
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -175,9 +175,9 @@ define <vscale x 2 x float> @roundeven_nxv2f32(<vscale x 2 x float> %x) strictfp
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -198,9 +198,9 @@ define <vscale x 4 x float> @roundeven_nxv4f32(<vscale x 4 x float> %x) strictfp
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -221,9 +221,9 @@ define <vscale x 8 x float> @roundeven_nxv8f32(<vscale x 8 x float> %x) strictfp
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -244,9 +244,9 @@ define <vscale x 16 x float> @roundeven_nxv16f32(<vscale x 16 x float> %x) stric
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -265,11 +265,11 @@ define <vscale x 1 x double> @roundeven_nxv1f64(<vscale x 1 x double> %x) strict
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -288,11 +288,11 @@ define <vscale x 2 x double> @roundeven_nxv2f64(<vscale x 2 x double> %x) strict
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -311,11 +311,11 @@ define <vscale x 4 x double> @roundeven_nxv4f64(<vscale x 4 x double> %x) strict
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -334,11 +334,11 @@ define <vscale x 8 x double> @roundeven_nxv8f64(<vscale x 8 x double> %x) strict
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll
index 865531b77eb29..903345dca1af2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll
@@ -19,11 +19,11 @@ define <vscale x 1 x bfloat> @roundeven_nxv1bf16(<vscale x 1 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -42,11 +42,11 @@ define <vscale x 2 x bfloat> @roundeven_nxv2bf16(<vscale x 2 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -65,11 +65,11 @@ define <vscale x 4 x bfloat> @roundeven_nxv4bf16(<vscale x 4 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -88,11 +88,11 @@ define <vscale x 8 x bfloat> @roundeven_nxv8bf16(<vscale x 8 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -111,11 +111,11 @@ define <vscale x 16 x bfloat> @roundeven_nxv16bf16(<vscale x 16 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -134,11 +134,11 @@ define <vscale x 32 x bfloat> @roundeven_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -168,12 +168,12 @@ define <vscale x 32 x bfloat> @roundeven_nxv32bf16(<vscale x 32 x bfloat> %x) {
 define <vscale x 1 x half> @roundeven_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFH-LABEL: roundeven_nxv1f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI6_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI6_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -186,11 +186,11 @@ define <vscale x 1 x half> @roundeven_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -207,12 +207,12 @@ declare <vscale x 1 x half> @llvm.roundeven.nxv1f16(<vscale x 1 x half>)
 define <vscale x 2 x half> @roundeven_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFH-LABEL: roundeven_nxv2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI7_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI7_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -225,11 +225,11 @@ define <vscale x 2 x half> @roundeven_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -246,12 +246,12 @@ declare <vscale x 2 x half> @llvm.roundeven.nxv2f16(<vscale x 2 x half>)
 define <vscale x 4 x half> @roundeven_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFH-LABEL: roundeven_nxv4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI8_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI8_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI8_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI8_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -264,11 +264,11 @@ define <vscale x 4 x half> @roundeven_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -285,12 +285,12 @@ declare <vscale x 4 x half> @llvm.roundeven.nxv4f16(<vscale x 4 x half>)
 define <vscale x 8 x half> @roundeven_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFH-LABEL: roundeven_nxv8f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI9_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI9_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI9_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI9_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -303,11 +303,11 @@ define <vscale x 8 x half> @roundeven_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -324,12 +324,12 @@ declare <vscale x 8 x half> @llvm.roundeven.nxv8f16(<vscale x 8 x half>)
 define <vscale x 16 x half> @roundeven_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFH-LABEL: roundeven_nxv16f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI10_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI10_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8
-; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI10_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI10_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -342,11 +342,11 @@ define <vscale x 16 x half> @roundeven_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -363,12 +363,12 @@ declare <vscale x 16 x half> @llvm.roundeven.nxv16f16(<vscale x 16 x half>)
 define <vscale x 32 x half> @roundeven_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFH-LABEL: roundeven_nxv32f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a0, %hi(.LCPI11_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI11_0)(a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8
-; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI11_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI11_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -381,11 +381,11 @@ define <vscale x 32 x half> @roundeven_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -420,8 +420,8 @@ define <vscale x 1 x float> @roundeven_nxv1f32(<vscale x 1 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -440,8 +440,8 @@ define <vscale x 2 x float> @roundeven_nxv2f32(<vscale x 2 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -460,8 +460,8 @@ define <vscale x 4 x float> @roundeven_nxv4f32(<vscale x 4 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -480,8 +480,8 @@ define <vscale x 8 x float> @roundeven_nxv8f32(<vscale x 8 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -500,8 +500,8 @@ define <vscale x 16 x float> @roundeven_nxv16f32(<vscale x 16 x float> %x) {
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -516,12 +516,12 @@ declare <vscale x 16 x float> @llvm.roundeven.nxv16f32(<vscale x 16 x float>)
 define <vscale x 1 x double> @roundeven_nxv1f64(<vscale x 1 x double> %x) {
 ; CHECK-LABEL: roundeven_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI17_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -536,12 +536,12 @@ declare <vscale x 1 x double> @llvm.roundeven.nxv1f64(<vscale x 1 x double>)
 define <vscale x 2 x double> @roundeven_nxv2f64(<vscale x 2 x double> %x) {
 ; CHECK-LABEL: roundeven_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -556,12 +556,12 @@ declare <vscale x 2 x double> @llvm.roundeven.nxv2f64(<vscale x 2 x double>)
 define <vscale x 4 x double> @roundeven_nxv4f64(<vscale x 4 x double> %x) {
 ; CHECK-LABEL: roundeven_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -576,12 +576,12 @@ declare <vscale x 4 x double> @llvm.roundeven.nxv4f64(<vscale x 4 x double>)
 define <vscale x 8 x double> @roundeven_nxv8f64(<vscale x 8 x double> %x) {
 ; CHECK-LABEL: roundeven_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI20_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI20_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
index b569efc7447da..f52200b4e7c34 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
@@ -984,10 +984,10 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 48
+; CHECK-NEXT:    li a3, 40
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
@@ -1001,43 +1001,33 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a3, a1, 3
-; CHECK-NEXT:    sub a5, a4, a1
-; CHECK-NEXT:    add a6, a2, a3
-; CHECK-NEXT:    vl8re64.v v8, (a6)
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    li a7, 40
-; CHECK-NEXT:    mul a6, a6, a7
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
-; CHECK-NEXT:    sltu a6, a4, a5
-; CHECK-NEXT:    addi a6, a6, -1
-; CHECK-NEXT:    and a5, a6, a5
-; CHECK-NEXT:    srli a6, a1, 3
-; CHECK-NEXT:    add a3, a0, a3
-; CHECK-NEXT:    vl8re64.v v16, (a3)
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    vslidedown.vx v0, v0, a6
-; CHECK-NEXT:    li a3, 63
+; CHECK-NEXT:    slli a5, a3, 3
+; CHECK-NEXT:    srli a1, a3, 3
+; CHECK-NEXT:    sub a6, a4, a3
+; CHECK-NEXT:    vslidedown.vx v0, v0, a1
+; CHECK-NEXT:    add a1, a2, a5
+; CHECK-NEXT:    vl8re64.v v8, (a1)
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    sltu a1, a4, a6
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a6, a1, a6
+; CHECK-NEXT:    li a1, 63
+; CHECK-NEXT:    add a5, a0, a5
+; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; CHECK-NEXT:    vand.vx v16, v8, a1, v0.t
 ; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    li a7, 40
-; CHECK-NEXT:    mul a6, a6, a7
+; CHECK-NEXT:    slli a6, a6, 4
 ; CHECK-NEXT:    add a6, sp, a6
 ; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vl8r.v v8, (a6) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
+; CHECK-NEXT:    vs8r.v v16, (a6) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v16, (a5)
 ; CHECK-NEXT:    csrr a5, vlenb
 ; CHECK-NEXT:    slli a5, a5, 3
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a5, vlenb
 ; CHECK-NEXT:    slli a5, a5, 4
 ; CHECK-NEXT:    add a5, sp, a5
@@ -1048,21 +1038,35 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vv v16, v16, v8, v0.t
+; CHECK-NEXT:    vsrl.vv v16, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 3
+; CHECK-NEXT:    slli a5, a5, 4
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a5, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    vand.vx v16, v8, a1, v0.t
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 40
+; CHECK-NEXT:    li a6, 24
 ; CHECK-NEXT:    mul a5, a5, a6
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
-; CHECK-NEXT:    addi a5, sp, 16
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    slli a5, a5, 4
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
+; CHECK-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    li a6, 24
+; CHECK-NEXT:    mul a5, a5, a6
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -1071,66 +1075,40 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re64.v v8, (a2)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 40
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsll.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsll.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    bltu a4, a1, .LBB46_2
+; CHECK-NEXT:    bltu a4, a3, .LBB46_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a4, a1
+; CHECK-NEXT:    mv a4, a3
 ; CHECK-NEXT:  .LBB46_2:
 ; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 40
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vx v8, v16, a3, v0.t
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsrl.vv v16, v16, v8, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 40
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vnot.v v16, v8, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a3, v0.t
+; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    vand.vx v16, v8, a1, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add a0, sp, a0
@@ -1151,7 +1129,7 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 48
+; CHECK-NEXT:    li a1, 40
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
@@ -1176,13 +1154,13 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -1193,7 +1171,10 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a1
 ; CHECK-NEXT:    add a1, a2, a5
 ; CHECK-NEXT:    vl8re64.v v8, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    sltu a1, a4, a6
 ; CHECK-NEXT:    addi a1, a1, -1
@@ -1207,7 +1188,8 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    addi a6, a6, 16
 ; CHECK-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 5
+; CHECK-NEXT:    li a7, 24
+; CHECK-NEXT:    mul a6, a6, a7
 ; CHECK-NEXT:    add a6, sp, a6
 ; CHECK-NEXT:    addi a6, a6, 16
 ; CHECK-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
@@ -1218,17 +1200,34 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    vl8r.v v8, (a6) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsll.vv v16, v16, v8, v0.t
 ; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 3
+; CHECK-NEXT:    li a7, 24
+; CHECK-NEXT:    mul a6, a6, a7
 ; CHECK-NEXT:    add a6, sp, a6
 ; CHECK-NEXT:    addi a6, a6, 16
 ; CHECK-NEXT:    vs8r.v v16, (a6) # Unknown-size Folded Spill
 ; CHECK-NEXT:    add a5, a0, a5
-; CHECK-NEXT:    addi a6, sp, 16
+; CHECK-NEXT:    csrr a6, vlenb
+; CHECK-NEXT:    slli a6, a6, 3
+; CHECK-NEXT:    add a6, sp, a6
+; CHECK-NEXT:    addi a6, a6, 16
 ; CHECK-NEXT:    vl8r.v v8, (a6) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vl8re64.v v16, (a5)
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    addi a5, sp, 16
+; CHECK-NEXT:    vand.vx v16, v8, a1, v0.t
+; CHECK-NEXT:    vl8re64.v v8, (a5)
+; CHECK-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsrl.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    li a6, 24
+; CHECK-NEXT:    mul a5, a5, a6
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
+; CHECK-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    li a6, 24
+; CHECK-NEXT:    mul a5, a5, a6
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -1237,59 +1236,37 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re64.v v8, (a2)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsrl.vi v16, v16, 1, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a4, a3, .LBB47_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a4, a3
 ; CHECK-NEXT:  .LBB47_2:
 ; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a0, a0, a2
+; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsll.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a0, a0, a2
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsll.vv v16, v16, v8, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vx v16, v8, a1, v0.t
@@ -1301,14 +1278,14 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsrl.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll
index 4d47c265a9747..625fc0d8fe087 100644
--- a/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll
@@ -331,12 +331,12 @@ define <vscale x 1 x i32> @ceil_nxv1f16_to_ui32(<vscale x 1 x half> %x) {
 define <vscale x 1 x i64> @ceil_nxv1f16_to_si64(<vscale x 1 x half> %x) {
 ; CHECK-LABEL: ceil_nxv1f16_to_si64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI22_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI22_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI22_0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI22_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -388,12 +388,12 @@ define <vscale x 1 x i64> @ceil_nxv1f16_to_si64(<vscale x 1 x half> %x) {
 define <vscale x 1 x i64> @ceil_nxv1f16_to_ui64(<vscale x 1 x half> %x) {
 ; CHECK-LABEL: ceil_nxv1f16_to_ui64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI23_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI23_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI23_0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI23_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -533,12 +533,12 @@ define <vscale x 4 x i32> @ceil_nxv4f16_to_ui32(<vscale x 4 x half> %x) {
 define <vscale x 4 x i64> @ceil_nxv4f16_to_si64(<vscale x 4 x half> %x) {
 ; CHECK-LABEL: ceil_nxv4f16_to_si64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI30_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI30_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI30_0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI30_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -590,12 +590,12 @@ define <vscale x 4 x i64> @ceil_nxv4f16_to_si64(<vscale x 4 x half> %x) {
 define <vscale x 4 x i64> @ceil_nxv4f16_to_ui64(<vscale x 4 x half> %x) {
 ; CHECK-LABEL: ceil_nxv4f16_to_ui64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI31_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI31_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI31_0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI31_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/localvar.ll b/llvm/test/CodeGen/RISCV/rvv/localvar.ll
index ad8fde013ce08..fb7cd0072efa9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/localvar.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/localvar.ll
@@ -223,14 +223,14 @@ define void @local_var_m2_with_varsize_object(i64 %n) {
 ; RV64IV-NEXT:    andi a0, a0, -16
 ; RV64IV-NEXT:    sub a0, sp, a0
 ; RV64IV-NEXT:    mv sp, a0
-; RV64IV-NEXT:    csrr a1, vlenb
-; RV64IV-NEXT:    slli a1, a1, 1
-; RV64IV-NEXT:    sub a1, s0, a1
-; RV64IV-NEXT:    addi a1, a1, -32
 ; RV64IV-NEXT:    csrr s1, vlenb
 ; RV64IV-NEXT:    slli s1, s1, 1
 ; RV64IV-NEXT:    sub s1, s0, s1
 ; RV64IV-NEXT:    addi s1, s1, -32
+; RV64IV-NEXT:    csrr a1, vlenb
+; RV64IV-NEXT:    slli a1, a1, 1
+; RV64IV-NEXT:    sub a1, s0, a1
+; RV64IV-NEXT:    addi a1, a1, -32
 ; RV64IV-NEXT:    call notdead
 ; RV64IV-NEXT:    vl2r.v v8, (s1)
 ; RV64IV-NEXT:    csrr a0, vlenb
@@ -282,15 +282,15 @@ define void @local_var_m2_with_bp(i64 %n) {
 ; RV64IV-NEXT:    andi a0, a0, -16
 ; RV64IV-NEXT:    sub a0, sp, a0
 ; RV64IV-NEXT:    mv sp, a0
+; RV64IV-NEXT:    csrr s2, vlenb
+; RV64IV-NEXT:    slli s2, s2, 1
+; RV64IV-NEXT:    add s2, s1, s2
+; RV64IV-NEXT:    addi s2, s2, 224
 ; RV64IV-NEXT:    addi a1, s1, 128
 ; RV64IV-NEXT:    csrr a2, vlenb
 ; RV64IV-NEXT:    slli a2, a2, 1
 ; RV64IV-NEXT:    add a2, s1, a2
 ; RV64IV-NEXT:    addi a2, a2, 224
-; RV64IV-NEXT:    csrr s2, vlenb
-; RV64IV-NEXT:    slli s2, s2, 1
-; RV64IV-NEXT:    add s2, s1, s2
-; RV64IV-NEXT:    addi s2, s2, 224
 ; RV64IV-NEXT:    call notdead2
 ; RV64IV-NEXT:    lw zero, 124(s1)
 ; RV64IV-NEXT:    vl2r.v v8, (s2)
diff --git a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll
index 2553f563b7d0f..85b04f177f66f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll
@@ -137,12 +137,12 @@ define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-NEXT:    sb a2, 6(a0)
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV32-NEXT:    vle8.v v8, (a1)
+; RV32-NEXT:    addi a2, a0, 4
 ; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    addi a1, a1, 4
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; RV32-NEXT:    vle8.v v8, (a1)
-; RV32-NEXT:    addi a0, a0, 4
-; RV32-NEXT:    vse8.v v8, (a0)
+; RV32-NEXT:    vse8.v v8, (a2)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: unaligned_memcpy7:
@@ -151,12 +151,12 @@ define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-NEXT:    sb a2, 6(a0)
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV64-NEXT:    vle8.v v8, (a1)
+; RV64-NEXT:    addi a2, a0, 4
 ; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    addi a1, a1, 4
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; RV64-NEXT:    vle8.v v8, (a1)
-; RV64-NEXT:    addi a0, a0, 4
-; RV64-NEXT:    vse8.v v8, (a0)
+; RV64-NEXT:    vse8.v v8, (a2)
 ; RV64-NEXT:    ret
 ;
 ; RV32-FAST-LABEL: unaligned_memcpy7:
@@ -223,11 +223,11 @@ define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; RV32-NEXT:    vle8.v v8, (a2)
 ; RV32-NEXT:    addi a2, a0, 12
+; RV32-NEXT:    addi a0, a0, 8
 ; RV32-NEXT:    vse8.v v8, (a2)
 ; RV32-NEXT:    addi a1, a1, 8
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV32-NEXT:    vle8.v v8, (a1)
-; RV32-NEXT:    addi a0, a0, 8
 ; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
@@ -242,11 +242,11 @@ define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; RV64-NEXT:    vle8.v v8, (a2)
 ; RV64-NEXT:    addi a2, a0, 12
+; RV64-NEXT:    addi a0, a0, 8
 ; RV64-NEXT:    vse8.v v8, (a2)
 ; RV64-NEXT:    addi a1, a1, 8
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV64-NEXT:    vle8.v v8, (a1)
-; RV64-NEXT:    addi a0, a0, 8
 ; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    ret
 ;
@@ -312,9 +312,9 @@ define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV32-NEXT:    vle8.v v8, (a1)
 ; RV32-NEXT:    vse8.v v8, (a0)
+; RV32-NEXT:    addi a0, a0, 15
 ; RV32-NEXT:    addi a1, a1, 15
 ; RV32-NEXT:    vle8.v v8, (a1)
-; RV32-NEXT:    addi a0, a0, 15
 ; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
@@ -323,9 +323,9 @@ define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64-NEXT:    vle8.v v8, (a1)
 ; RV64-NEXT:    vse8.v v8, (a0)
+; RV64-NEXT:    addi a0, a0, 15
 ; RV64-NEXT:    addi a1, a1, 15
 ; RV64-NEXT:    vle8.v v8, (a1)
-; RV64-NEXT:    addi a0, a0, 15
 ; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    ret
 ;
@@ -334,9 +334,9 @@ define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-FAST-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-FAST-NEXT:    vle64.v v8, (a1)
 ; RV32-FAST-NEXT:    vse64.v v8, (a0)
+; RV32-FAST-NEXT:    addi a0, a0, 15
 ; RV32-FAST-NEXT:    addi a1, a1, 15
 ; RV32-FAST-NEXT:    vle64.v v8, (a1)
-; RV32-FAST-NEXT:    addi a0, a0, 15
 ; RV32-FAST-NEXT:    vse64.v v8, (a0)
 ; RV32-FAST-NEXT:    ret
 ;
@@ -345,9 +345,9 @@ define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-FAST-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-FAST-NEXT:    vle64.v v8, (a1)
 ; RV64-FAST-NEXT:    vse64.v v8, (a0)
+; RV64-FAST-NEXT:    addi a0, a0, 15
 ; RV64-FAST-NEXT:    addi a1, a1, 15
 ; RV64-FAST-NEXT:    vle64.v v8, (a1)
-; RV64-FAST-NEXT:    addi a0, a0, 15
 ; RV64-FAST-NEXT:    vse64.v v8, (a0)
 ; RV64-FAST-NEXT:    ret
 entry:
@@ -459,10 +459,10 @@ define void @unaligned_memcpy96(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-FAST-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-FAST-NEXT:    vle64.v v8, (a1)
 ; RV32-FAST-NEXT:    vse64.v v8, (a0)
+; RV32-FAST-NEXT:    addi a0, a0, 64
 ; RV32-FAST-NEXT:    addi a1, a1, 64
 ; RV32-FAST-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-FAST-NEXT:    vle64.v v8, (a1)
-; RV32-FAST-NEXT:    addi a0, a0, 64
 ; RV32-FAST-NEXT:    vse64.v v8, (a0)
 ; RV32-FAST-NEXT:    ret
 ;
@@ -471,10 +471,10 @@ define void @unaligned_memcpy96(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-FAST-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-FAST-NEXT:    vle64.v v8, (a1)
 ; RV64-FAST-NEXT:    vse64.v v8, (a0)
+; RV64-FAST-NEXT:    addi a0, a0, 64
 ; RV64-FAST-NEXT:    addi a1, a1, 64
 ; RV64-FAST-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-FAST-NEXT:    vle64.v v8, (a1)
-; RV64-FAST-NEXT:    addi a0, a0, 64
 ; RV64-FAST-NEXT:    vse64.v v8, (a0)
 ; RV64-FAST-NEXT:    ret
 entry:
@@ -568,12 +568,12 @@ define void @unaligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-FAST-NEXT:    sw a2, 192(a0)
 ; RV32-FAST-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-FAST-NEXT:    vle64.v v8, (a1)
+; RV32-FAST-NEXT:    addi a2, a0, 128
 ; RV32-FAST-NEXT:    vse64.v v8, (a0)
-; RV32-FAST-NEXT:    addi a1, a1, 128
+; RV32-FAST-NEXT:    addi a0, a1, 128
 ; RV32-FAST-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-FAST-NEXT:    vle64.v v8, (a1)
-; RV32-FAST-NEXT:    addi a0, a0, 128
-; RV32-FAST-NEXT:    vse64.v v8, (a0)
+; RV32-FAST-NEXT:    vle64.v v8, (a0)
+; RV32-FAST-NEXT:    vse64.v v8, (a2)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: unaligned_memcpy196:
@@ -582,12 +582,12 @@ define void @unaligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-FAST-NEXT:    sw a2, 192(a0)
 ; RV64-FAST-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-FAST-NEXT:    vle64.v v8, (a1)
+; RV64-FAST-NEXT:    addi a2, a0, 128
 ; RV64-FAST-NEXT:    vse64.v v8, (a0)
-; RV64-FAST-NEXT:    addi a1, a1, 128
+; RV64-FAST-NEXT:    addi a0, a1, 128
 ; RV64-FAST-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-FAST-NEXT:    vle64.v v8, (a1)
-; RV64-FAST-NEXT:    addi a0, a0, 128
-; RV64-FAST-NEXT:    vse64.v v8, (a0)
+; RV64-FAST-NEXT:    vle64.v v8, (a0)
+; RV64-FAST-NEXT:    vse64.v v8, (a2)
 ; RV64-FAST-NEXT:    ret
 entry:
   tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 196, i1 false)
@@ -624,9 +624,9 @@ define void @unaligned_memcpy256(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-FAST-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-FAST-NEXT:    vle64.v v8, (a1)
 ; RV32-FAST-NEXT:    vse64.v v8, (a0)
+; RV32-FAST-NEXT:    addi a0, a0, 128
 ; RV32-FAST-NEXT:    addi a1, a1, 128
 ; RV32-FAST-NEXT:    vle64.v v8, (a1)
-; RV32-FAST-NEXT:    addi a0, a0, 128
 ; RV32-FAST-NEXT:    vse64.v v8, (a0)
 ; RV32-FAST-NEXT:    ret
 ;
@@ -635,9 +635,9 @@ define void @unaligned_memcpy256(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-FAST-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-FAST-NEXT:    vle64.v v8, (a1)
 ; RV64-FAST-NEXT:    vse64.v v8, (a0)
+; RV64-FAST-NEXT:    addi a0, a0, 128
 ; RV64-FAST-NEXT:    addi a1, a1, 128
 ; RV64-FAST-NEXT:    vle64.v v8, (a1)
-; RV64-FAST-NEXT:    addi a0, a0, 128
 ; RV64-FAST-NEXT:    vse64.v v8, (a0)
 ; RV64-FAST-NEXT:    ret
 entry:
@@ -837,10 +837,10 @@ define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a1)
 ; RV32-NEXT:    vse64.v v8, (a0)
+; RV32-NEXT:    addi a0, a0, 15
 ; RV32-NEXT:    addi a1, a1, 15
 ; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV32-NEXT:    vle8.v v8, (a1)
-; RV32-NEXT:    addi a0, a0, 15
 ; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
@@ -849,10 +849,10 @@ define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a1)
 ; RV64-NEXT:    vse64.v v8, (a0)
+; RV64-NEXT:    addi a0, a0, 15
 ; RV64-NEXT:    addi a1, a1, 15
 ; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64-NEXT:    vle8.v v8, (a1)
-; RV64-NEXT:    addi a0, a0, 15
 ; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    ret
 ;
@@ -861,9 +861,9 @@ define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-FAST-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-FAST-NEXT:    vle64.v v8, (a1)
 ; RV32-FAST-NEXT:    vse64.v v8, (a0)
+; RV32-FAST-NEXT:    addi a0, a0, 15
 ; RV32-FAST-NEXT:    addi a1, a1, 15
 ; RV32-FAST-NEXT:    vle64.v v8, (a1)
-; RV32-FAST-NEXT:    addi a0, a0, 15
 ; RV32-FAST-NEXT:    vse64.v v8, (a0)
 ; RV32-FAST-NEXT:    ret
 ;
@@ -872,9 +872,9 @@ define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-FAST-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-FAST-NEXT:    vle64.v v8, (a1)
 ; RV64-FAST-NEXT:    vse64.v v8, (a0)
+; RV64-FAST-NEXT:    addi a0, a0, 15
 ; RV64-FAST-NEXT:    addi a1, a1, 15
 ; RV64-FAST-NEXT:    vle64.v v8, (a1)
-; RV64-FAST-NEXT:    addi a0, a0, 15
 ; RV64-FAST-NEXT:    vse64.v v8, (a0)
 ; RV64-FAST-NEXT:    ret
 entry:
@@ -926,10 +926,10 @@ define void @aligned_memcpy96(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-BOTH-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-BOTH-NEXT:    vle64.v v8, (a1)
 ; RV32-BOTH-NEXT:    vse64.v v8, (a0)
+; RV32-BOTH-NEXT:    addi a0, a0, 64
 ; RV32-BOTH-NEXT:    addi a1, a1, 64
 ; RV32-BOTH-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-BOTH-NEXT:    vle64.v v8, (a1)
-; RV32-BOTH-NEXT:    addi a0, a0, 64
 ; RV32-BOTH-NEXT:    vse64.v v8, (a0)
 ; RV32-BOTH-NEXT:    ret
 ;
@@ -938,10 +938,10 @@ define void @aligned_memcpy96(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-BOTH-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-BOTH-NEXT:    vle64.v v8, (a1)
 ; RV64-BOTH-NEXT:    vse64.v v8, (a0)
+; RV64-BOTH-NEXT:    addi a0, a0, 64
 ; RV64-BOTH-NEXT:    addi a1, a1, 64
 ; RV64-BOTH-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-BOTH-NEXT:    vle64.v v8, (a1)
-; RV64-BOTH-NEXT:    addi a0, a0, 64
 ; RV64-BOTH-NEXT:    vse64.v v8, (a0)
 ; RV64-BOTH-NEXT:    ret
 entry:
@@ -975,12 +975,12 @@ define void @aligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-BOTH-NEXT:    sw a2, 192(a0)
 ; RV32-BOTH-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-BOTH-NEXT:    vle64.v v8, (a1)
+; RV32-BOTH-NEXT:    addi a2, a0, 128
 ; RV32-BOTH-NEXT:    vse64.v v8, (a0)
-; RV32-BOTH-NEXT:    addi a1, a1, 128
+; RV32-BOTH-NEXT:    addi a0, a1, 128
 ; RV32-BOTH-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-BOTH-NEXT:    vle64.v v8, (a1)
-; RV32-BOTH-NEXT:    addi a0, a0, 128
-; RV32-BOTH-NEXT:    vse64.v v8, (a0)
+; RV32-BOTH-NEXT:    vle64.v v8, (a0)
+; RV32-BOTH-NEXT:    vse64.v v8, (a2)
 ; RV32-BOTH-NEXT:    ret
 ;
 ; RV64-BOTH-LABEL: aligned_memcpy196:
@@ -989,12 +989,12 @@ define void @aligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-BOTH-NEXT:    sw a2, 192(a0)
 ; RV64-BOTH-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-BOTH-NEXT:    vle64.v v8, (a1)
+; RV64-BOTH-NEXT:    addi a2, a0, 128
 ; RV64-BOTH-NEXT:    vse64.v v8, (a0)
-; RV64-BOTH-NEXT:    addi a1, a1, 128
+; RV64-BOTH-NEXT:    addi a0, a1, 128
 ; RV64-BOTH-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-BOTH-NEXT:    vle64.v v8, (a1)
-; RV64-BOTH-NEXT:    addi a0, a0, 128
-; RV64-BOTH-NEXT:    vse64.v v8, (a0)
+; RV64-BOTH-NEXT:    vle64.v v8, (a0)
+; RV64-BOTH-NEXT:    vse64.v v8, (a2)
 ; RV64-BOTH-NEXT:    ret
 entry:
   tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 196, i1 false)
@@ -1007,9 +1007,9 @@ define void @aligned_memcpy256(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-BOTH-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-BOTH-NEXT:    vle64.v v8, (a1)
 ; RV32-BOTH-NEXT:    vse64.v v8, (a0)
+; RV32-BOTH-NEXT:    addi a0, a0, 128
 ; RV32-BOTH-NEXT:    addi a1, a1, 128
 ; RV32-BOTH-NEXT:    vle64.v v8, (a1)
-; RV32-BOTH-NEXT:    addi a0, a0, 128
 ; RV32-BOTH-NEXT:    vse64.v v8, (a0)
 ; RV32-BOTH-NEXT:    ret
 ;
@@ -1018,9 +1018,9 @@ define void @aligned_memcpy256(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-BOTH-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-BOTH-NEXT:    vle64.v v8, (a1)
 ; RV64-BOTH-NEXT:    vse64.v v8, (a0)
+; RV64-BOTH-NEXT:    addi a0, a0, 128
 ; RV64-BOTH-NEXT:    addi a1, a1, 128
 ; RV64-BOTH-NEXT:    vle64.v v8, (a1)
-; RV64-BOTH-NEXT:    addi a0, a0, 128
 ; RV64-BOTH-NEXT:    vse64.v v8, (a0)
 ; RV64-BOTH-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/memory-args.ll b/llvm/test/CodeGen/RISCV/rvv/memory-args.ll
index 8190a82d7035b..f4502ee0fa8f4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/memory-args.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/memory-args.ll
@@ -55,9 +55,9 @@ define <vscale x 64 x i8> @caller() {
 ; RV64IV-NEXT:    add a0, sp, a0
 ; RV64IV-NEXT:    addi a0, a0, 64
 ; RV64IV-NEXT:    vl8r.v v24, (a0)
-; RV64IV-NEXT:    addi a1, sp, 64
 ; RV64IV-NEXT:    addi a0, sp, 64
-; RV64IV-NEXT:    vs8r.v v24, (a1)
+; RV64IV-NEXT:    vs8r.v v24, (a0)
+; RV64IV-NEXT:    addi a0, sp, 64
 ; RV64IV-NEXT:    call callee
 ; RV64IV-NEXT:    addi sp, s0, -80
 ; RV64IV-NEXT:    .cfi_def_cfa sp, 80
diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
index 0fad09f27007c..893658ebb1901 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
@@ -1221,12 +1221,12 @@ define void @mgather_nxv16i64(<vscale x 8 x ptr> %ptrs0, <vscale x 8 x ptr> %ptr
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vl8re64.v v24, (a0)
 ; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    vsetvli a2, zero, e64, m8, ta, mu
+; RV32-NEXT:    vluxei32.v v16, (zero), v8, v0.t
 ; RV32-NEXT:    srli a2, a0, 3
 ; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vx v7, v0, a2
+; RV32-NEXT:    vslidedown.vx v0, v0, a2
 ; RV32-NEXT:    vsetvli a2, zero, e64, m8, ta, mu
-; RV32-NEXT:    vluxei32.v v16, (zero), v8, v0.t
-; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    vluxei32.v v24, (zero), v12, v0.t
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, a1, a0
@@ -1236,37 +1236,20 @@ define void @mgather_nxv16i64(<vscale x 8 x ptr> %ptrs0, <vscale x 8 x ptr> %ptr
 ;
 ; RV64-LABEL: mgather_nxv16i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    slli a3, a3, 3
-; RV64-NEXT:    sub sp, sp, a3
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV64-NEXT:    addi a3, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vmv8r.v v16, v8
 ; RV64-NEXT:    vl8re64.v v24, (a0)
-; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
+; RV64-NEXT:    vluxei64.v v24, (zero), v8, v0.t
 ; RV64-NEXT:    vl8re64.v v8, (a1)
+; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    srli a1, a0, 3
-; RV64-NEXT:    vslidedown.vx v7, v0, a1
+; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT:    vluxei64.v v24, (zero), v16, v0.t
-; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vluxei64.v v8, (zero), v16, v0.t
 ; RV64-NEXT:    slli a0, a0, 3
 ; RV64-NEXT:    add a0, a2, a0
 ; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    vs8r.v v24, (a2)
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    .cfi_def_cfa sp, 16
-; RV64-NEXT:    addi sp, sp, 16
-; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %p0 = call <vscale x 16 x ptr> @llvm.vector.insert.nxv8p0.nxv16p0(<vscale x 16 x ptr> undef, <vscale x 8 x ptr> %ptrs0, i64 0)
   %p1 = call <vscale x 16 x ptr> @llvm.vector.insert.nxv8p0.nxv16p0(<vscale x 16 x ptr> %p0, <vscale x 8 x ptr> %ptrs1, i64 8)
@@ -2347,12 +2330,12 @@ define <vscale x 32 x i8> @mgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8>
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV64-NEXT:    vluxei64.v v12, (a0), v24, v0.t
+; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsext.vf8 v24, v9
 ; RV64-NEXT:    srli a2, a1, 3
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a2
-; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf8 v24, v9
-; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
+; RV64-NEXT:    vsetvli a3, zero, e8, m1, ta, mu
 ; RV64-NEXT:    vluxei64.v v13, (a0), v24, v0.t
 ; RV64-NEXT:    srli a1, a1, 2
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
index 3cf7cc9cb5152..cd6f76a79373f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
@@ -1904,57 +1904,25 @@ define void @mscatter_nxv16f64(<vscale x 8 x double> %val0, <vscale x 8 x double
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 5
+; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    sub sp, sp, a2
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 24
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    addi a2, sp, 16
 ; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 4
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    vl8re64.v v8, (a0)
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vl8re64.v v16, (a1)
+; RV64-NEXT:    vl8re64.v v24, (a0)
+; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsoxei64.v v8, (zero), v24, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    vl8re64.v v8, (a1)
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    srli a0, a0, 3
 ; RV64-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v24, v0, a0
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v16, (zero), v8, v0.t
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a1, 24
-; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vslidedown.vx v0, v0, a0
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsoxei64.v v16, (zero), v8, v0.t
+; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsoxei64.v v8, (zero), v16, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    slli a0, a0, 3
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
 ; RV64-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/mutate-prior-vsetvli-avl.ll b/llvm/test/CodeGen/RISCV/rvv/mutate-prior-vsetvli-avl.ll
index dc34a49e56aa5..d0fe5ca46b72f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mutate-prior-vsetvli-avl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mutate-prior-vsetvli-avl.ll
@@ -17,10 +17,10 @@ define dso_local void @test(ptr nocapture noundef %var_99) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    csrwi vxrm, 0
 ; CHECK-NEXT:    vmul.vx v16, v8, a1
+; CHECK-NEXT:    vssra.vv v12, v12, v8
 ; CHECK-NEXT:    vmv.x.s a1, v16
 ; CHECK-NEXT:    vmsleu.vx v0, v8, a1
-; CHECK-NEXT:    vssra.vv v8, v12, v8
-; CHECK-NEXT:    vmerge.vvm v8, v8, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v12, v12, v0
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
index 9298e8b520bd9..ffbc6d4b332f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
@@ -1943,19 +1943,18 @@ define <vscale x 3 x i64> @reverse_nxv3i64(<vscale x 3 x i64> %a) {
 ; CHECK-LABEL: reverse_nxv3i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v14, v12, a0
-; CHECK-NEXT:    vrgather.vv v13, v10, v14
-; CHECK-NEXT:    vrgather.vv v10, v9, v14
-; CHECK-NEXT:    vmv.v.v v12, v13
-; CHECK-NEXT:    vrgather.vv v15, v8, v14
-; CHECK-NEXT:    vmv.v.v v13, v10
-; CHECK-NEXT:    vrgather.vv v8, v11, v14
-; CHECK-NEXT:    vmv.v.v v14, v15
-; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vrsub.vx v12, v12, a0
+; CHECK-NEXT:    vrgather.vv v15, v8, v12
+; CHECK-NEXT:    vrgather.vv v14, v9, v12
+; CHECK-NEXT:    vrgather.vv v9, v10, v12
+; CHECK-NEXT:    vrgather.vv v8, v11, v12
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vmv.v.v v9, v14
+; CHECK-NEXT:    vmv.v.v v10, v15
 ; CHECK-NEXT:    ret
   %res = call <vscale x 3 x i64> @llvm.vector.reverse.nxv3i64(<vscale x 3 x i64> %a)
   ret <vscale x 3 x i64> %res
@@ -1969,19 +1968,18 @@ define <vscale x 6 x i64> @reverse_nxv6i64(<vscale x 6 x i64> %a) {
 ; CHECK-NEXT:    vid.v v16
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vrsub.vx v22, v16, a0
-; CHECK-NEXT:    vrgather.vv v21, v10, v22
-; CHECK-NEXT:    vrgather.vv v19, v12, v22
-; CHECK-NEXT:    vrgather.vv v18, v13, v22
-; CHECK-NEXT:    vrgather.vv v20, v11, v22
-; CHECK-NEXT:    vmv2r.v v16, v18
-; CHECK-NEXT:    vmv2r.v v18, v20
-; CHECK-NEXT:    vrgather.vv v31, v8, v22
-; CHECK-NEXT:    vrgather.vv v30, v9, v22
-; CHECK-NEXT:    vrgather.vv v9, v14, v22
-; CHECK-NEXT:    vrgather.vv v8, v15, v22
-; CHECK-NEXT:    vmv2r.v v20, v30
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vrsub.vx v16, v16, a0
+; CHECK-NEXT:    vrgather.vv v23, v8, v16
+; CHECK-NEXT:    vrgather.vv v21, v10, v16
+; CHECK-NEXT:    vrgather.vv v22, v9, v16
+; CHECK-NEXT:    vrgather.vv v20, v11, v16
+; CHECK-NEXT:    vrgather.vv v11, v12, v16
+; CHECK-NEXT:    vrgather.vv v10, v13, v16
+; CHECK-NEXT:    vrgather.vv v9, v14, v16
+; CHECK-NEXT:    vrgather.vv v8, v15, v16
+; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv2r.v v10, v20
+; CHECK-NEXT:    vmv2r.v v12, v22
 ; CHECK-NEXT:    ret
   %res = call <vscale x 6 x i64> @llvm.vector.reverse.nxv6i64(<vscale x 6 x i64> %a)
   ret <vscale x 6 x i64> %res
diff --git a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
index 7d3700492ea7b..280cc03a01791 100644
--- a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
@@ -22,22 +22,22 @@ define <vscale x 1 x bfloat> @vp_nearbyint_nxv1bf16(<vscale x 1 x bfloat> %va, <
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v11, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v11, v10, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v11, v11, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v10, v11, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 1 x bfloat> @llvm.vp.nearbyint.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x bfloat> %v
@@ -49,18 +49,18 @@ define <vscale x 1 x bfloat> @vp_nearbyint_nxv1bf16_unmasked(<vscale x 1 x bfloa
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 1 x bfloat> @llvm.vp.nearbyint.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 1 x bfloat> %v
@@ -76,22 +76,22 @@ define <vscale x 2 x bfloat> @vp_nearbyint_nxv2bf16(<vscale x 2 x bfloat> %va, <
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v11, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv.v.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v11, v10, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v11, v11, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v10, v11, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x bfloat> @llvm.vp.nearbyint.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x bfloat> %v
@@ -103,18 +103,18 @@ define <vscale x 2 x bfloat> @vp_nearbyint_nxv2bf16_unmasked(<vscale x 2 x bfloa
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x bfloat> @llvm.vp.nearbyint.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 2 x bfloat> %v
@@ -130,22 +130,22 @@ define <vscale x 4 x bfloat> @vp_nearbyint_nxv4bf16(<vscale x 4 x bfloat> %va, <
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v10, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v10, v12, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x bfloat> @llvm.vp.nearbyint.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x bfloat> %v
@@ -157,18 +157,18 @@ define <vscale x 4 x bfloat> @vp_nearbyint_nxv4bf16_unmasked(<vscale x 4 x bfloa
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x bfloat> @llvm.vp.nearbyint.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 4 x bfloat> %v
@@ -184,22 +184,22 @@ define <vscale x 8 x bfloat> @vp_nearbyint_nxv8bf16(<vscale x 8 x bfloat> %va, <
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v12, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v12, v16, v12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x bfloat> @llvm.vp.nearbyint.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x bfloat> %v
@@ -211,18 +211,18 @@ define <vscale x 8 x bfloat> @vp_nearbyint_nxv8bf16_unmasked(<vscale x 8 x bfloa
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x bfloat> @llvm.vp.nearbyint.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 8 x bfloat> %v
@@ -238,22 +238,22 @@ define <vscale x 16 x bfloat> @vp_nearbyint_nxv16bf16(<vscale x 16 x bfloat> %va
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v24, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x bfloat> @llvm.vp.nearbyint.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x bfloat> %v
@@ -265,18 +265,18 @@ define <vscale x 16 x bfloat> @vp_nearbyint_nxv16bf16_unmasked(<vscale x 16 x bf
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x bfloat> @llvm.vp.nearbyint.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 16 x bfloat> %v
@@ -297,6 +297,7 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16(<vscale x 32 x bfloat> %va
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    lui a3, 307200
+; CHECK-NEXT:    frflags a4
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
@@ -315,12 +316,11 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16(<vscale x 32 x bfloat> %va
 ; CHECK-NEXT:    vfabs.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v18, v8, fa5, v0.t
-; CHECK-NEXT:    frflags a2
 ; CHECK-NEXT:    vmv1r.v v0, v18
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
-; CHECK-NEXT:    fsflags a2
+; CHECK-NEXT:    fsflags a4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v17
@@ -336,21 +336,21 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16(<vscale x 32 x bfloat> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v7
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -375,11 +375,12 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16_unmasked(<vscale x 32 x bf
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmset.m v16
 ; CHECK-NEXT:    lui a3, 307200
+; CHECK-NEXT:    frflags a4
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v16, v16, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    vmv1r.v v17, v16
@@ -394,12 +395,11 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16_unmasked(<vscale x 32 x bf
 ; CHECK-NEXT:    vfabs.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; CHECK-NEXT:    frflags a2
 ; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
-; CHECK-NEXT:    fsflags a2
+; CHECK-NEXT:    fsflags a4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v16
@@ -413,17 +413,17 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16_unmasked(<vscale x 32 x bf
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -439,19 +439,19 @@ declare <vscale x 1 x half> @llvm.vp.nearbyint.nxv1f16(<vscale x 1 x half>, <vsc
 define <vscale x 1 x half> @vp_nearbyint_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_nearbyint_nxv1f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI12_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI12_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI12_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI12_0)(a0)
+; ZVFH-NEXT:    frflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    frflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv1f16:
@@ -461,22 +461,22 @@ define <vscale x 1 x half> @vp_nearbyint_nxv1f16(<vscale x 1 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v11, v11, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v10, v11, v10, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x half> @llvm.vp.nearbyint.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x half> %v
@@ -485,17 +485,17 @@ define <vscale x 1 x half> @vp_nearbyint_nxv1f16(<vscale x 1 x half> %va, <vscal
 define <vscale x 1 x half> @vp_nearbyint_nxv1f16_unmasked(<vscale x 1 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_nearbyint_nxv1f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI13_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI13_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI13_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI13_0)(a0)
 ; ZVFH-NEXT:    frflags a0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv1f16_unmasked:
@@ -503,18 +503,18 @@ define <vscale x 1 x half> @vp_nearbyint_nxv1f16_unmasked(<vscale x 1 x half> %v
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x half> @llvm.vp.nearbyint.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 1 x half> %v
@@ -525,19 +525,19 @@ declare <vscale x 2 x half> @llvm.vp.nearbyint.nxv2f16(<vscale x 2 x half>, <vsc
 define <vscale x 2 x half> @vp_nearbyint_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_nearbyint_nxv2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI14_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI14_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI14_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI14_0)(a0)
+; ZVFH-NEXT:    frflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    frflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv2f16:
@@ -547,22 +547,22 @@ define <vscale x 2 x half> @vp_nearbyint_nxv2f16(<vscale x 2 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v11, v11, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v10, v11, v10, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x half> @llvm.vp.nearbyint.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x half> %v
@@ -571,17 +571,17 @@ define <vscale x 2 x half> @vp_nearbyint_nxv2f16(<vscale x 2 x half> %va, <vscal
 define <vscale x 2 x half> @vp_nearbyint_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_nearbyint_nxv2f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI15_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI15_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI15_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI15_0)(a0)
 ; ZVFH-NEXT:    frflags a0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv2f16_unmasked:
@@ -589,18 +589,18 @@ define <vscale x 2 x half> @vp_nearbyint_nxv2f16_unmasked(<vscale x 2 x half> %v
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v9, v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x half> @llvm.vp.nearbyint.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 2 x half> %v
@@ -611,19 +611,19 @@ declare <vscale x 4 x half> @llvm.vp.nearbyint.nxv4f16(<vscale x 4 x half>, <vsc
 define <vscale x 4 x half> @vp_nearbyint_nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_nearbyint_nxv4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI16_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI16_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI16_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI16_0)(a0)
+; ZVFH-NEXT:    frflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    frflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv4f16:
@@ -633,22 +633,22 @@ define <vscale x 4 x half> @vp_nearbyint_nxv4f16(<vscale x 4 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10, v0.t
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 4 x half> @llvm.vp.nearbyint.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x half> %v
@@ -657,17 +657,17 @@ define <vscale x 4 x half> @vp_nearbyint_nxv4f16(<vscale x 4 x half> %va, <vscal
 define <vscale x 4 x half> @vp_nearbyint_nxv4f16_unmasked(<vscale x 4 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_nearbyint_nxv4f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI17_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI17_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI17_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI17_0)(a0)
 ; ZVFH-NEXT:    frflags a0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv4f16_unmasked:
@@ -675,18 +675,18 @@ define <vscale x 4 x half> @vp_nearbyint_nxv4f16_unmasked(<vscale x 4 x half> %v
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v10, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 4 x half> @llvm.vp.nearbyint.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 4 x half> %v
@@ -699,19 +699,19 @@ define <vscale x 8 x half> @vp_nearbyint_nxv8f16(<vscale x 8 x half> %va, <vscal
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v10, v0
+; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI18_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI18_0)(a0)
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    frflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    frflags a0
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv8f16:
@@ -721,22 +721,22 @@ define <vscale x 8 x half> @vp_nearbyint_nxv8f16(<vscale x 8 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12, v0.t
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 8 x half> @llvm.vp.nearbyint.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x half> %v
@@ -745,17 +745,17 @@ define <vscale x 8 x half> @vp_nearbyint_nxv8f16(<vscale x 8 x half> %va, <vscal
 define <vscale x 8 x half> @vp_nearbyint_nxv8f16_unmasked(<vscale x 8 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_nearbyint_nxv8f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI19_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI19_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI19_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI19_0)(a0)
 ; ZVFH-NEXT:    frflags a0
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv8f16_unmasked:
@@ -763,18 +763,18 @@ define <vscale x 8 x half> @vp_nearbyint_nxv8f16_unmasked(<vscale x 8 x half> %v
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v12, v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 8 x half> @llvm.vp.nearbyint.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 8 x half> %v
@@ -787,19 +787,19 @@ define <vscale x 16 x half> @vp_nearbyint_nxv16f16(<vscale x 16 x half> %va, <vs
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v12, v0
+; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI20_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI20_0)(a0)
-; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
+; ZVFH-NEXT:    frflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; ZVFH-NEXT:    frflags a0
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv16f16:
@@ -809,22 +809,22 @@ define <vscale x 16 x half> @vp_nearbyint_nxv16f16(<vscale x 16 x half> %va, <vs
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v24, fa5, v0.t
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 16 x half> @llvm.vp.nearbyint.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x half> %v
@@ -833,17 +833,17 @@ define <vscale x 16 x half> @vp_nearbyint_nxv16f16(<vscale x 16 x half> %va, <vs
 define <vscale x 16 x half> @vp_nearbyint_nxv16f16_unmasked(<vscale x 16 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_nearbyint_nxv16f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI21_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI21_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8
-; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI21_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI21_0)(a0)
 ; ZVFH-NEXT:    frflags a0
+; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv16f16_unmasked:
@@ -851,18 +851,18 @@ define <vscale x 16 x half> @vp_nearbyint_nxv16f16_unmasked(<vscale x 16 x half>
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 16 x half> @llvm.vp.nearbyint.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 16 x half> %v
@@ -875,19 +875,19 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v16, v0
+; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI22_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI22_0)(a0)
-; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
+; ZVFH-NEXT:    frflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; ZVFH-NEXT:    frflags a0
 ; ZVFH-NEXT:    vmv1r.v v0, v16
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv32f16:
@@ -902,6 +902,7 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    lui a3, 307200
+; ZVFHMIN-NEXT:    frflags a4
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
@@ -920,12 +921,11 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v18, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    frflags a2
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v18
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
-; ZVFHMIN-NEXT:    fsflags a2
+; ZVFHMIN-NEXT:    fsflags a4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v17
@@ -941,21 +941,21 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v7
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24, v0.t
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -970,17 +970,17 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vs
 define <vscale x 32 x half> @vp_nearbyint_nxv32f16_unmasked(<vscale x 32 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_nearbyint_nxv32f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI23_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI23_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8
-; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI23_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI23_0)(a0)
 ; ZVFH-NEXT:    frflags a0
+; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; ZVFH-NEXT:    fsflags a0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv32f16_unmasked:
@@ -995,11 +995,12 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    lui a3, 307200
+; ZVFHMIN-NEXT:    frflags a4
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v17, v16
@@ -1014,12 +1015,11 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    frflags a2
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v17
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
-; ZVFHMIN-NEXT:    fsflags a2
+; ZVFHMIN-NEXT:    fsflags a4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v16
@@ -1033,17 +1033,17 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
-; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    fsflags a0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -1064,15 +1064,15 @@ define <vscale x 1 x float> @vp_nearbyint_nxv1f32(<vscale x 1 x float> %va, <vsc
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 1 x float> @llvm.vp.nearbyint.nxv1f32(<vscale x 1 x float> %va, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x float> %v
@@ -1085,13 +1085,13 @@ define <vscale x 1 x float> @vp_nearbyint_nxv1f32_unmasked(<vscale x 1 x float>
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 1 x float> @llvm.vp.nearbyint.nxv1f32(<vscale x 1 x float> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 1 x float> %v
@@ -1106,15 +1106,15 @@ define <vscale x 2 x float> @vp_nearbyint_nxv2f32(<vscale x 2 x float> %va, <vsc
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x float> @llvm.vp.nearbyint.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x float> %v
@@ -1127,13 +1127,13 @@ define <vscale x 2 x float> @vp_nearbyint_nxv2f32_unmasked(<vscale x 2 x float>
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x float> @llvm.vp.nearbyint.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 2 x float> %v
@@ -1149,16 +1149,16 @@ define <vscale x 4 x float> @vp_nearbyint_nxv4f32(<vscale x 4 x float> %va, <vsc
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x float> @llvm.vp.nearbyint.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x float> %v
@@ -1171,13 +1171,13 @@ define <vscale x 4 x float> @vp_nearbyint_nxv4f32_unmasked(<vscale x 4 x float>
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x float> @llvm.vp.nearbyint.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 4 x float> %v
@@ -1193,16 +1193,16 @@ define <vscale x 8 x float> @vp_nearbyint_nxv8f32(<vscale x 8 x float> %va, <vsc
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x float> @llvm.vp.nearbyint.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x float> %v
@@ -1215,13 +1215,13 @@ define <vscale x 8 x float> @vp_nearbyint_nxv8f32_unmasked(<vscale x 8 x float>
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x float> @llvm.vp.nearbyint.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 8 x float> %v
@@ -1237,16 +1237,16 @@ define <vscale x 16 x float> @vp_nearbyint_nxv16f32(<vscale x 16 x float> %va, <
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x float> @llvm.vp.nearbyint.nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x float> %v
@@ -1259,13 +1259,13 @@ define <vscale x 16 x float> @vp_nearbyint_nxv16f32_unmasked(<vscale x 16 x floa
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x float> @llvm.vp.nearbyint.nxv16f32(<vscale x 16 x float> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 16 x float> %v
@@ -1276,19 +1276,19 @@ declare <vscale x 1 x double> @llvm.vp.nearbyint.nxv1f64(<vscale x 1 x double>,
 define <vscale x 1 x double> @vp_nearbyint_nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI34_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI34_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
+; CHECK-NEXT:    lui a0, %hi(.LCPI34_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI34_0)(a0)
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 1 x double> @llvm.vp.nearbyint.nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x double> %v
@@ -1297,17 +1297,17 @@ define <vscale x 1 x double> @vp_nearbyint_nxv1f64(<vscale x 1 x double> %va, <v
 define <vscale x 1 x double> @vp_nearbyint_nxv1f64_unmasked(<vscale x 1 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv1f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI35_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI35_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI35_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI35_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 1 x double> @llvm.vp.nearbyint.nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 1 x double> %v
@@ -1320,19 +1320,19 @@ define <vscale x 2 x double> @vp_nearbyint_nxv2f64(<vscale x 2 x double> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI36_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI36_0)(a0)
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.vp.nearbyint.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 %evl)
   ret <vscale x 2 x double> %v
@@ -1341,17 +1341,17 @@ define <vscale x 2 x double> @vp_nearbyint_nxv2f64(<vscale x 2 x double> %va, <v
 define <vscale x 2 x double> @vp_nearbyint_nxv2f64_unmasked(<vscale x 2 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv2f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI37_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI37_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI37_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI37_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 2 x double> @llvm.vp.nearbyint.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 2 x double> %v
@@ -1364,19 +1364,19 @@ define <vscale x 4 x double> @vp_nearbyint_nxv4f64(<vscale x 4 x double> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI38_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI38_0)(a0)
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x double> @llvm.vp.nearbyint.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 %evl)
   ret <vscale x 4 x double> %v
@@ -1385,17 +1385,17 @@ define <vscale x 4 x double> @vp_nearbyint_nxv4f64(<vscale x 4 x double> %va, <v
 define <vscale x 4 x double> @vp_nearbyint_nxv4f64_unmasked(<vscale x 4 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv4f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI39_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI39_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI39_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI39_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x double> @llvm.vp.nearbyint.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 4 x double> %v
@@ -1408,19 +1408,19 @@ define <vscale x 7 x double> @vp_nearbyint_nxv7f64(<vscale x 7 x double> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI40_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI40_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 7 x double> @llvm.vp.nearbyint.nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 %evl)
   ret <vscale x 7 x double> %v
@@ -1429,17 +1429,17 @@ define <vscale x 7 x double> @vp_nearbyint_nxv7f64(<vscale x 7 x double> %va, <v
 define <vscale x 7 x double> @vp_nearbyint_nxv7f64_unmasked(<vscale x 7 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv7f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI41_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI41_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI41_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI41_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 7 x double> @llvm.vp.nearbyint.nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 7 x double> %v
@@ -1452,19 +1452,19 @@ define <vscale x 8 x double> @vp_nearbyint_nxv8f64(<vscale x 8 x double> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI42_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI42_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x double> @llvm.vp.nearbyint.nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 %evl)
   ret <vscale x 8 x double> %v
@@ -1473,17 +1473,17 @@ define <vscale x 8 x double> @vp_nearbyint_nxv8f64(<vscale x 8 x double> %va, <v
 define <vscale x 8 x double> @vp_nearbyint_nxv8f64_unmasked(<vscale x 8 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv8f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI43_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI43_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI43_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI43_0)(a0)
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 8 x double> @llvm.vp.nearbyint.nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 8 x double> %v
@@ -1495,8 +1495,15 @@ declare <vscale x 16 x double> @llvm.vp.nearbyint.nxv16f64(<vscale x 16 x double
 define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv16f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vmv8r.v v24, v16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
 ; CHECK-NEXT:    srli a3, a1, 3
@@ -1506,36 +1513,46 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    frflags a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vfabs.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; CHECK-NEXT:    frflags a2
+; CHECK-NEXT:    vmflt.vf v6, v16, fa5, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
-; CHECK-NEXT:    fsflags a2
+; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    fsflags a3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    fsflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.nearbyint.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x double> %v
@@ -1551,13 +1568,13 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    frflags a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    frflags a2
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
-; CHECK-NEXT:    fsflags a2
+; CHECK-NEXT:    fsflags a3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB45_2
@@ -1566,13 +1583,13 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
 ; CHECK-NEXT:  .LBB45_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    fsflags a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.nearbyint.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll b/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll
index feb96deb920ff..b83439f6baa22 100644
--- a/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll
@@ -19,24 +19,23 @@ define signext i32 @foo(i32 signext %aa) #0 {
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    andi sp, sp, -16
 ; CHECK-NEXT:    mv s1, sp
-; CHECK-NEXT:    lw t0, 44(s1)
+; CHECK-NEXT:    sw a0, 52(s1)
+; CHECK-NEXT:    sw a0, 48(s1)
+; CHECK-NEXT:    lw a0, 44(s1)
 ; CHECK-NEXT:    lw a2, 40(s1)
 ; CHECK-NEXT:    lw a3, 36(s1)
 ; CHECK-NEXT:    lw a4, 32(s1)
 ; CHECK-NEXT:    lw a5, 28(s1)
 ; CHECK-NEXT:    lw a6, 24(s1)
 ; CHECK-NEXT:    lw a7, 20(s1)
-; CHECK-NEXT:    lw t1, 16(s1)
-; CHECK-NEXT:    lw t2, 12(s1)
-; CHECK-NEXT:    lw t3, 8(s1)
-; CHECK-NEXT:    sw a0, 52(s1)
-; CHECK-NEXT:    sw a0, 48(s1)
+; CHECK-NEXT:    lw a1, 16(s1)
+; CHECK-NEXT:    lw t0, 12(s1)
+; CHECK-NEXT:    lw t1, 8(s1)
 ; CHECK-NEXT:    addi sp, sp, -32
+; CHECK-NEXT:    sd a1, 0(sp)
+; CHECK-NEXT:    sd t0, 8(sp)
+; CHECK-NEXT:    sd t1, 16(sp)
 ; CHECK-NEXT:    addi a1, s1, 48
-; CHECK-NEXT:    sd t1, 0(sp)
-; CHECK-NEXT:    sd t2, 8(sp)
-; CHECK-NEXT:    sd t3, 16(sp)
-; CHECK-NEXT:    mv a0, t0
 ; CHECK-NEXT:    call gfunc
 ; CHECK-NEXT:    addi sp, sp, 32
 ; CHECK-NEXT:    li a0, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr125306.ll b/llvm/test/CodeGen/RISCV/rvv/pr125306.ll
index 111f87de220db..f3ac76eaace6f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr125306.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr125306.ll
@@ -19,57 +19,57 @@ define <2 x i32> @main(ptr %0) {
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    li a2, 64
+; CHECK-NEXT:    li a1, 64
 ; CHECK-NEXT:    sw zero, 80(zero)
-; CHECK-NEXT:    lui a1, 7
+; CHECK-NEXT:    lui a2, 7
 ; CHECK-NEXT:    lui a3, 1
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    li a4, 16
 ; CHECK-NEXT:    lui a5, 2
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vse32.v v10, (a2)
+; CHECK-NEXT:    vse32.v v10, (a1)
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    li a2, 24
+; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    sh zero, -392(a3)
 ; CHECK-NEXT:    sh zero, 534(a3)
 ; CHECK-NEXT:    sh zero, 1460(a3)
 ; CHECK-NEXT:    li a3, 32
-; CHECK-NEXT:    vse32.v v10, (a2)
-; CHECK-NEXT:    li a2, 40
+; CHECK-NEXT:    vse32.v v10, (a1)
+; CHECK-NEXT:    li a1, 40
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vadd.vi v9, v11, -1
 ; CHECK-NEXT:    sh zero, -1710(a5)
 ; CHECK-NEXT:    sh zero, -784(a5)
 ; CHECK-NEXT:    sh zero, 142(a5)
-; CHECK-NEXT:    lw a5, -304(a1)
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v11, -1
 ; CHECK-NEXT:    vse32.v v10, (a3)
 ; CHECK-NEXT:    sh zero, 0(a0)
-; CHECK-NEXT:    lw a0, -188(a1)
-; CHECK-NEXT:    vse32.v v10, (a2)
-; CHECK-NEXT:    lw a2, -188(a1)
-; CHECK-NEXT:    lw a3, 1244(a1)
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    lw a0, 1244(a1)
-; CHECK-NEXT:    lw a1, -304(a1)
-; CHECK-NEXT:    vmv.v.x v10, a3
-; CHECK-NEXT:    vmv.v.x v11, a5
+; CHECK-NEXT:    vse32.v v10, (a1)
+; CHECK-NEXT:    lw a0, 1244(a2)
+; CHECK-NEXT:    lw a1, 1244(a2)
+; CHECK-NEXT:    lw a3, -188(a2)
+; CHECK-NEXT:    lw a5, -188(a2)
+; CHECK-NEXT:    vmv.v.x v8, a3
+; CHECK-NEXT:    lw a3, -304(a2)
+; CHECK-NEXT:    lw a2, -304(a2)
+; CHECK-NEXT:    sh zero, 0(zero)
+; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vmv.v.x v11, a3
 ; CHECK-NEXT:    vslide1down.vx v8, v8, zero
 ; CHECK-NEXT:    vslide1down.vx v10, v10, zero
 ; CHECK-NEXT:    vmin.vv v8, v10, v8
-; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vmv.v.x v10, a1
 ; CHECK-NEXT:    vslide1down.vx v11, v11, zero
+; CHECK-NEXT:    vmin.vx v10, v10, a5
 ; CHECK-NEXT:    vmin.vx v10, v10, a2
-; CHECK-NEXT:    vmin.vx v10, v10, a1
 ; CHECK-NEXT:    vmin.vv v11, v8, v11
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    vand.vv v9, v11, v9
-; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    vse32.v v9, (a4)
-; CHECK-NEXT:    sh zero, 0(zero)
+; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    ret
 entry:
   store <16 x i32> zeroinitializer, ptr null, align 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
index dbd4224c7ef08..d09b200485092 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
@@ -18,12 +18,11 @@ define <4 x float> @foo(ptr %0) nounwind {
 ; CHECK-NEXT:    lhu a0, 6(a0)
 ; CHECK-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NEXT:    call __extendhfsf2
-; CHECK-NEXT:    fmv.w.x fa5, s2
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    fmv.s fa0, fa5
+; CHECK-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
@@ -36,12 +35,11 @@ define <4 x float> @foo(ptr %0) nounwind {
 ; CHECK-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NEXT:    call __extendhfsf2
-; CHECK-NEXT:    fmv.w.x fa5, s0
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    fmv.s fa0, fa5
+; CHECK-NEXT:    fmv.w.x fa0, s0
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v8, fa0
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
index 06a357eeaeb61..4be681ec51234 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
@@ -105,8 +105,8 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:    addi s8, s8, 4
 ; CHECK-NEXT:    addi ra, ra, 4
 ; CHECK-NEXT:    addi a3, a3, 4
-; CHECK-NEXT:    andi s10, a0, 1
 ; CHECK-NEXT:    addi s11, s11, 4
+; CHECK-NEXT:    andi s10, a0, 1
 ; CHECK-NEXT:    beqz s10, .LBB0_4
 ; CHECK-NEXT:  # %bb.7: # %for.cond.cleanup11.i
 ; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=3
@@ -114,8 +114,8 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:    addi s3, s3, 4
 ; CHECK-NEXT:    addi s7, s7, 4
 ; CHECK-NEXT:    addi s6, s6, 4
-; CHECK-NEXT:    andi a1, a2, 1
 ; CHECK-NEXT:    addi s5, s5, 4
+; CHECK-NEXT:    andi a1, a2, 1
 ; CHECK-NEXT:    beqz a1, .LBB0_3
 ; CHECK-NEXT:  # %bb.8: # %for.cond.cleanup7.i
 ; CHECK-NEXT:    # in Loop: Header=BB0_2 Depth=2
diff --git a/llvm/test/CodeGen/RISCV/rvv/reg-alloc-reserve-bp.ll b/llvm/test/CodeGen/RISCV/rvv/reg-alloc-reserve-bp.ll
index 142fc2b867173..24e37fe327aa5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/reg-alloc-reserve-bp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/reg-alloc-reserve-bp.ll
@@ -28,7 +28,7 @@ define void @foo(ptr nocapture noundef %p1) {
 ; CHECK-NEXT:    addi a0, s1, 160
 ; CHECK-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    addi t0, s1, 64
+; CHECK-NEXT:    addi a7, s1, 64
 ; CHECK-NEXT:    li a0, 1
 ; CHECK-NEXT:    li a1, 2
 ; CHECK-NEXT:    li a2, 3
@@ -36,8 +36,8 @@ define void @foo(ptr nocapture noundef %p1) {
 ; CHECK-NEXT:    li a4, 5
 ; CHECK-NEXT:    li a5, 6
 ; CHECK-NEXT:    li a6, 7
+; CHECK-NEXT:    sd a7, 0(sp)
 ; CHECK-NEXT:    li a7, 8
-; CHECK-NEXT:    sd t0, 0(sp)
 ; CHECK-NEXT:    call bar
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
index 39744dcecd718..bc9a61f895b2c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
@@ -22,12 +22,12 @@ define <vscale x 1 x bfloat> @vp_round_nxv1bf16(<vscale x 1 x bfloat> %va, <vsca
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v11, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -49,11 +49,11 @@ define <vscale x 1 x bfloat> @vp_round_nxv1bf16_unmasked(<vscale x 1 x bfloat> %
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -76,12 +76,12 @@ define <vscale x 2 x bfloat> @vp_round_nxv2bf16(<vscale x 2 x bfloat> %va, <vsca
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v11, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv.v.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -103,11 +103,11 @@ define <vscale x 2 x bfloat> @vp_round_nxv2bf16_unmasked(<vscale x 2 x bfloat> %
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -130,12 +130,12 @@ define <vscale x 4 x bfloat> @vp_round_nxv4bf16(<vscale x 4 x bfloat> %va, <vsca
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v10, v0.t
@@ -157,11 +157,11 @@ define <vscale x 4 x bfloat> @vp_round_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -184,12 +184,12 @@ define <vscale x 8 x bfloat> @vp_round_nxv8bf16(<vscale x 8 x bfloat> %va, <vsca
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v12, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v12, v0.t
@@ -211,11 +211,11 @@ define <vscale x 8 x bfloat> @vp_round_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -238,12 +238,12 @@ define <vscale x 16 x bfloat> @vp_round_nxv16bf16(<vscale x 16 x bfloat> %va, <v
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
@@ -265,11 +265,11 @@ define <vscale x 16 x bfloat> @vp_round_nxv16bf16_unmasked(<vscale x 16 x bfloat
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -297,6 +297,7 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    lui a3, 307200
+; CHECK-NEXT:    fsrmi a4, 4
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
@@ -315,11 +316,10 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    vfabs.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v18, v8, fa5, v0.t
-; CHECK-NEXT:    fsrmi a2, 4
 ; CHECK-NEXT:    vmv1r.v v0, v18
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a4
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -336,11 +336,11 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v7
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
@@ -375,11 +375,12 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmset.m v16
 ; CHECK-NEXT:    lui a3, 307200
+; CHECK-NEXT:    fsrmi a4, 4
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v16, v16, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    vmv1r.v v17, v16
@@ -394,11 +395,10 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    vfabs.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; CHECK-NEXT:    fsrmi a2, 4
 ; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a4
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -413,10 +413,10 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -439,13 +439,13 @@ declare <vscale x 1 x half> @llvm.vp.round.nxv1f16(<vscale x 1 x half>, <vscale
 define <vscale x 1 x half> @vp_round_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_nxv1f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI12_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI12_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI12_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI12_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -461,12 +461,12 @@ define <vscale x 1 x half> @vp_round_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -485,12 +485,12 @@ define <vscale x 1 x half> @vp_round_nxv1f16(<vscale x 1 x half> %va, <vscale x
 define <vscale x 1 x half> @vp_round_nxv1f16_unmasked(<vscale x 1 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_nxv1f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI13_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI13_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI13_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI13_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -503,11 +503,11 @@ define <vscale x 1 x half> @vp_round_nxv1f16_unmasked(<vscale x 1 x half> %va, i
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -525,13 +525,13 @@ declare <vscale x 2 x half> @llvm.vp.round.nxv2f16(<vscale x 2 x half>, <vscale
 define <vscale x 2 x half> @vp_round_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_nxv2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI14_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI14_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI14_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI14_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -547,12 +547,12 @@ define <vscale x 2 x half> @vp_round_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -571,12 +571,12 @@ define <vscale x 2 x half> @vp_round_nxv2f16(<vscale x 2 x half> %va, <vscale x
 define <vscale x 2 x half> @vp_round_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_nxv2f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI15_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI15_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI15_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI15_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -589,11 +589,11 @@ define <vscale x 2 x half> @vp_round_nxv2f16_unmasked(<vscale x 2 x half> %va, i
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -611,13 +611,13 @@ declare <vscale x 4 x half> @llvm.vp.round.nxv4f16(<vscale x 4 x half>, <vscale
 define <vscale x 4 x half> @vp_round_nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_nxv4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI16_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI16_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI16_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI16_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -633,12 +633,12 @@ define <vscale x 4 x half> @vp_round_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
@@ -657,12 +657,12 @@ define <vscale x 4 x half> @vp_round_nxv4f16(<vscale x 4 x half> %va, <vscale x
 define <vscale x 4 x half> @vp_round_nxv4f16_unmasked(<vscale x 4 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_nxv4f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI17_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI17_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI17_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI17_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -675,11 +675,11 @@ define <vscale x 4 x half> @vp_round_nxv4f16_unmasked(<vscale x 4 x half> %va, i
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -699,12 +699,12 @@ define <vscale x 8 x half> @vp_round_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v10, v0
+; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI18_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI18_0)(a0)
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -721,12 +721,12 @@ define <vscale x 8 x half> @vp_round_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
@@ -745,12 +745,12 @@ define <vscale x 8 x half> @vp_round_nxv8f16(<vscale x 8 x half> %va, <vscale x
 define <vscale x 8 x half> @vp_round_nxv8f16_unmasked(<vscale x 8 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_nxv8f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI19_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI19_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI19_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI19_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -763,11 +763,11 @@ define <vscale x 8 x half> @vp_round_nxv8f16_unmasked(<vscale x 8 x half> %va, i
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -787,12 +787,12 @@ define <vscale x 16 x half> @vp_round_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v12, v0
+; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI20_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI20_0)(a0)
-; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -809,12 +809,12 @@ define <vscale x 16 x half> @vp_round_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v24, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
@@ -833,12 +833,12 @@ define <vscale x 16 x half> @vp_round_nxv16f16(<vscale x 16 x half> %va, <vscale
 define <vscale x 16 x half> @vp_round_nxv16f16_unmasked(<vscale x 16 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_nxv16f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI21_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI21_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8
-; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI21_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI21_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -851,11 +851,11 @@ define <vscale x 16 x half> @vp_round_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -875,12 +875,12 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v16, v0
+; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI22_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI22_0)(a0)
-; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 4
 ; ZVFH-NEXT:    vmv1r.v v0, v16
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -902,6 +902,7 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    lui a3, 307200
+; ZVFHMIN-NEXT:    fsrmi a4, 4
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
@@ -920,11 +921,10 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v18, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a2, 4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v18
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    fsrm a2
+; ZVFHMIN-NEXT:    fsrm a4
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -941,11 +941,11 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v7
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
@@ -970,12 +970,12 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
 define <vscale x 32 x half> @vp_round_nxv32f16_unmasked(<vscale x 32 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_nxv32f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI23_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI23_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8
-; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI23_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI23_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 4
+; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -995,11 +995,12 @@ define <vscale x 32 x half> @vp_round_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    lui a3, 307200
+; ZVFHMIN-NEXT:    fsrmi a4, 4
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v17, v16
@@ -1014,11 +1015,10 @@ define <vscale x 32 x half> @vp_round_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a2, 4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v17
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    fsrm a2
+; ZVFHMIN-NEXT:    fsrm a4
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -1033,10 +1033,10 @@ define <vscale x 32 x half> @vp_round_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -1064,9 +1064,9 @@ define <vscale x 1 x float> @vp_round_nxv1f32(<vscale x 1 x float> %va, <vscale
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1085,8 +1085,8 @@ define <vscale x 1 x float> @vp_round_nxv1f32_unmasked(<vscale x 1 x float> %va,
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -1106,9 +1106,9 @@ define <vscale x 2 x float> @vp_round_nxv2f32(<vscale x 2 x float> %va, <vscale
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1127,8 +1127,8 @@ define <vscale x 2 x float> @vp_round_nxv2f32_unmasked(<vscale x 2 x float> %va,
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -1149,9 +1149,9 @@ define <vscale x 4 x float> @vp_round_nxv4f32(<vscale x 4 x float> %va, <vscale
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -1171,8 +1171,8 @@ define <vscale x 4 x float> @vp_round_nxv4f32_unmasked(<vscale x 4 x float> %va,
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -1193,9 +1193,9 @@ define <vscale x 8 x float> @vp_round_nxv8f32(<vscale x 8 x float> %va, <vscale
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -1215,8 +1215,8 @@ define <vscale x 8 x float> @vp_round_nxv8f32_unmasked(<vscale x 8 x float> %va,
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -1237,9 +1237,9 @@ define <vscale x 16 x float> @vp_round_nxv16f32(<vscale x 16 x float> %va, <vsca
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -1259,8 +1259,8 @@ define <vscale x 16 x float> @vp_round_nxv16f32_unmasked(<vscale x 16 x float> %
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -1276,13 +1276,13 @@ declare <vscale x 1 x double> @llvm.vp.round.nxv1f64(<vscale x 1 x double>, <vsc
 define <vscale x 1 x double> @vp_round_nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI34_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI34_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
+; CHECK-NEXT:    lui a0, %hi(.LCPI34_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI34_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1297,12 +1297,12 @@ define <vscale x 1 x double> @vp_round_nxv1f64(<vscale x 1 x double> %va, <vscal
 define <vscale x 1 x double> @vp_round_nxv1f64_unmasked(<vscale x 1 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv1f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI35_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI35_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI35_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI35_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -1320,12 +1320,12 @@ define <vscale x 2 x double> @vp_round_nxv2f64(<vscale x 2 x double> %va, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI36_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI36_0)(a0)
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -1341,12 +1341,12 @@ define <vscale x 2 x double> @vp_round_nxv2f64(<vscale x 2 x double> %va, <vscal
 define <vscale x 2 x double> @vp_round_nxv2f64_unmasked(<vscale x 2 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv2f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI37_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI37_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI37_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI37_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -1364,12 +1364,12 @@ define <vscale x 4 x double> @vp_round_nxv4f64(<vscale x 4 x double> %va, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI38_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI38_0)(a0)
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -1385,12 +1385,12 @@ define <vscale x 4 x double> @vp_round_nxv4f64(<vscale x 4 x double> %va, <vscal
 define <vscale x 4 x double> @vp_round_nxv4f64_unmasked(<vscale x 4 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv4f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI39_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI39_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI39_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI39_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -1408,12 +1408,12 @@ define <vscale x 7 x double> @vp_round_nxv7f64(<vscale x 7 x double> %va, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI40_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI40_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -1429,12 +1429,12 @@ define <vscale x 7 x double> @vp_round_nxv7f64(<vscale x 7 x double> %va, <vscal
 define <vscale x 7 x double> @vp_round_nxv7f64_unmasked(<vscale x 7 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv7f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI41_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI41_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI41_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI41_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -1452,12 +1452,12 @@ define <vscale x 8 x double> @vp_round_nxv8f64(<vscale x 8 x double> %va, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI42_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI42_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -1473,12 +1473,12 @@ define <vscale x 8 x double> @vp_round_nxv8f64(<vscale x 8 x double> %va, <vscal
 define <vscale x 8 x double> @vp_round_nxv8f64_unmasked(<vscale x 8 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv8f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI43_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI43_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI43_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI43_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -1498,59 +1498,66 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
 ; CHECK-NEXT:    srli a3, a1, 3
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    vslidedown.vx v6, v0, a3
+; CHECK-NEXT:    vslidedown.vx v25, v0, a3
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    fsrmi a3, 4
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a2, 4
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmflt.vf v25, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    fsrm a3
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -1570,12 +1577,12 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    fsrmi a3, 4
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a2, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a3
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
@@ -1585,8 +1592,8 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:  .LBB45_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
index df5844277c997..d4043fd8b6816 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
@@ -22,12 +22,12 @@ define <vscale x 1 x bfloat> @vp_roundeven_nxv1bf16(<vscale x 1 x bfloat> %va, <
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v11, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -49,11 +49,11 @@ define <vscale x 1 x bfloat> @vp_roundeven_nxv1bf16_unmasked(<vscale x 1 x bfloa
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -76,12 +76,12 @@ define <vscale x 2 x bfloat> @vp_roundeven_nxv2bf16(<vscale x 2 x bfloat> %va, <
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v11, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv.v.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -103,11 +103,11 @@ define <vscale x 2 x bfloat> @vp_roundeven_nxv2bf16_unmasked(<vscale x 2 x bfloa
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -130,12 +130,12 @@ define <vscale x 4 x bfloat> @vp_roundeven_nxv4bf16(<vscale x 4 x bfloat> %va, <
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v10, v0.t
@@ -157,11 +157,11 @@ define <vscale x 4 x bfloat> @vp_roundeven_nxv4bf16_unmasked(<vscale x 4 x bfloa
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -184,12 +184,12 @@ define <vscale x 8 x bfloat> @vp_roundeven_nxv8bf16(<vscale x 8 x bfloat> %va, <
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v12, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v12, v0.t
@@ -211,11 +211,11 @@ define <vscale x 8 x bfloat> @vp_roundeven_nxv8bf16_unmasked(<vscale x 8 x bfloa
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -238,12 +238,12 @@ define <vscale x 16 x bfloat> @vp_roundeven_nxv16bf16(<vscale x 16 x bfloat> %va
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
@@ -265,11 +265,11 @@ define <vscale x 16 x bfloat> @vp_roundeven_nxv16bf16_unmasked(<vscale x 16 x bf
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -297,6 +297,7 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16(<vscale x 32 x bfloat> %va
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    lui a3, 307200
+; CHECK-NEXT:    fsrmi a4, 0
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
@@ -315,11 +316,10 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16(<vscale x 32 x bfloat> %va
 ; CHECK-NEXT:    vfabs.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v18, v8, fa5, v0.t
-; CHECK-NEXT:    fsrmi a2, 0
 ; CHECK-NEXT:    vmv1r.v v0, v18
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a4
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -336,11 +336,11 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16(<vscale x 32 x bfloat> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v7
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
@@ -375,11 +375,12 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16_unmasked(<vscale x 32 x bf
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmset.m v16
 ; CHECK-NEXT:    lui a3, 307200
+; CHECK-NEXT:    fsrmi a4, 0
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v16, v16, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    vmv1r.v v17, v16
@@ -394,11 +395,10 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16_unmasked(<vscale x 32 x bf
 ; CHECK-NEXT:    vfabs.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; CHECK-NEXT:    fsrmi a2, 0
 ; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a4
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -413,10 +413,10 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16_unmasked(<vscale x 32 x bf
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -439,13 +439,13 @@ declare <vscale x 1 x half> @llvm.vp.roundeven.nxv1f16(<vscale x 1 x half>, <vsc
 define <vscale x 1 x half> @vp_roundeven_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_nxv1f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI12_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI12_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI12_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI12_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -461,12 +461,12 @@ define <vscale x 1 x half> @vp_roundeven_nxv1f16(<vscale x 1 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -485,12 +485,12 @@ define <vscale x 1 x half> @vp_roundeven_nxv1f16(<vscale x 1 x half> %va, <vscal
 define <vscale x 1 x half> @vp_roundeven_nxv1f16_unmasked(<vscale x 1 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_nxv1f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI13_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI13_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI13_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI13_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -503,11 +503,11 @@ define <vscale x 1 x half> @vp_roundeven_nxv1f16_unmasked(<vscale x 1 x half> %v
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -525,13 +525,13 @@ declare <vscale x 2 x half> @llvm.vp.roundeven.nxv2f16(<vscale x 2 x half>, <vsc
 define <vscale x 2 x half> @vp_roundeven_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_nxv2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI14_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI14_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI14_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI14_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -547,12 +547,12 @@ define <vscale x 2 x half> @vp_roundeven_nxv2f16(<vscale x 2 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -571,12 +571,12 @@ define <vscale x 2 x half> @vp_roundeven_nxv2f16(<vscale x 2 x half> %va, <vscal
 define <vscale x 2 x half> @vp_roundeven_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_nxv2f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI15_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI15_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI15_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI15_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -589,11 +589,11 @@ define <vscale x 2 x half> @vp_roundeven_nxv2f16_unmasked(<vscale x 2 x half> %v
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -611,13 +611,13 @@ declare <vscale x 4 x half> @llvm.vp.roundeven.nxv4f16(<vscale x 4 x half>, <vsc
 define <vscale x 4 x half> @vp_roundeven_nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_nxv4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI16_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI16_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI16_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI16_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -633,12 +633,12 @@ define <vscale x 4 x half> @vp_roundeven_nxv4f16(<vscale x 4 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
@@ -657,12 +657,12 @@ define <vscale x 4 x half> @vp_roundeven_nxv4f16(<vscale x 4 x half> %va, <vscal
 define <vscale x 4 x half> @vp_roundeven_nxv4f16_unmasked(<vscale x 4 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_nxv4f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI17_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI17_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI17_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI17_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -675,11 +675,11 @@ define <vscale x 4 x half> @vp_roundeven_nxv4f16_unmasked(<vscale x 4 x half> %v
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -699,12 +699,12 @@ define <vscale x 8 x half> @vp_roundeven_nxv8f16(<vscale x 8 x half> %va, <vscal
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v10, v0
+; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI18_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI18_0)(a0)
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -721,12 +721,12 @@ define <vscale x 8 x half> @vp_roundeven_nxv8f16(<vscale x 8 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
@@ -745,12 +745,12 @@ define <vscale x 8 x half> @vp_roundeven_nxv8f16(<vscale x 8 x half> %va, <vscal
 define <vscale x 8 x half> @vp_roundeven_nxv8f16_unmasked(<vscale x 8 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_nxv8f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI19_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI19_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI19_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI19_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -763,11 +763,11 @@ define <vscale x 8 x half> @vp_roundeven_nxv8f16_unmasked(<vscale x 8 x half> %v
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -787,12 +787,12 @@ define <vscale x 16 x half> @vp_roundeven_nxv16f16(<vscale x 16 x half> %va, <vs
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v12, v0
+; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI20_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI20_0)(a0)
-; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -809,12 +809,12 @@ define <vscale x 16 x half> @vp_roundeven_nxv16f16(<vscale x 16 x half> %va, <vs
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v24, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
@@ -833,12 +833,12 @@ define <vscale x 16 x half> @vp_roundeven_nxv16f16(<vscale x 16 x half> %va, <vs
 define <vscale x 16 x half> @vp_roundeven_nxv16f16_unmasked(<vscale x 16 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_nxv16f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI21_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI21_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8
-; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI21_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI21_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -851,11 +851,11 @@ define <vscale x 16 x half> @vp_roundeven_nxv16f16_unmasked(<vscale x 16 x half>
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -875,12 +875,12 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v16, v0
+; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI22_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI22_0)(a0)
-; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 0
 ; ZVFH-NEXT:    vmv1r.v v0, v16
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -902,6 +902,7 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    lui a3, 307200
+; ZVFHMIN-NEXT:    fsrmi a4, 0
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
@@ -920,11 +921,10 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v18, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a2, 0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v18
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    fsrm a2
+; ZVFHMIN-NEXT:    fsrm a4
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -941,11 +941,11 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v7
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
@@ -970,12 +970,12 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
 define <vscale x 32 x half> @vp_roundeven_nxv32f16_unmasked(<vscale x 32 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_nxv32f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI23_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI23_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8
-; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI23_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI23_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 0
+; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -995,11 +995,12 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    lui a3, 307200
+; ZVFHMIN-NEXT:    fsrmi a4, 0
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v17, v16
@@ -1014,11 +1015,10 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a2, 0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v17
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    fsrm a2
+; ZVFHMIN-NEXT:    fsrm a4
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -1033,10 +1033,10 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -1064,9 +1064,9 @@ define <vscale x 1 x float> @vp_roundeven_nxv1f32(<vscale x 1 x float> %va, <vsc
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1085,8 +1085,8 @@ define <vscale x 1 x float> @vp_roundeven_nxv1f32_unmasked(<vscale x 1 x float>
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -1106,9 +1106,9 @@ define <vscale x 2 x float> @vp_roundeven_nxv2f32(<vscale x 2 x float> %va, <vsc
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1127,8 +1127,8 @@ define <vscale x 2 x float> @vp_roundeven_nxv2f32_unmasked(<vscale x 2 x float>
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -1149,9 +1149,9 @@ define <vscale x 4 x float> @vp_roundeven_nxv4f32(<vscale x 4 x float> %va, <vsc
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -1171,8 +1171,8 @@ define <vscale x 4 x float> @vp_roundeven_nxv4f32_unmasked(<vscale x 4 x float>
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -1193,9 +1193,9 @@ define <vscale x 8 x float> @vp_roundeven_nxv8f32(<vscale x 8 x float> %va, <vsc
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -1215,8 +1215,8 @@ define <vscale x 8 x float> @vp_roundeven_nxv8f32_unmasked(<vscale x 8 x float>
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -1237,9 +1237,9 @@ define <vscale x 16 x float> @vp_roundeven_nxv16f32(<vscale x 16 x float> %va, <
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -1259,8 +1259,8 @@ define <vscale x 16 x float> @vp_roundeven_nxv16f32_unmasked(<vscale x 16 x floa
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -1276,13 +1276,13 @@ declare <vscale x 1 x double> @llvm.vp.roundeven.nxv1f64(<vscale x 1 x double>,
 define <vscale x 1 x double> @vp_roundeven_nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI34_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI34_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
+; CHECK-NEXT:    lui a0, %hi(.LCPI34_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI34_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1297,12 +1297,12 @@ define <vscale x 1 x double> @vp_roundeven_nxv1f64(<vscale x 1 x double> %va, <v
 define <vscale x 1 x double> @vp_roundeven_nxv1f64_unmasked(<vscale x 1 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv1f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI35_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI35_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI35_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI35_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -1320,12 +1320,12 @@ define <vscale x 2 x double> @vp_roundeven_nxv2f64(<vscale x 2 x double> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI36_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI36_0)(a0)
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -1341,12 +1341,12 @@ define <vscale x 2 x double> @vp_roundeven_nxv2f64(<vscale x 2 x double> %va, <v
 define <vscale x 2 x double> @vp_roundeven_nxv2f64_unmasked(<vscale x 2 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv2f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI37_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI37_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI37_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI37_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -1364,12 +1364,12 @@ define <vscale x 4 x double> @vp_roundeven_nxv4f64(<vscale x 4 x double> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI38_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI38_0)(a0)
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -1385,12 +1385,12 @@ define <vscale x 4 x double> @vp_roundeven_nxv4f64(<vscale x 4 x double> %va, <v
 define <vscale x 4 x double> @vp_roundeven_nxv4f64_unmasked(<vscale x 4 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv4f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI39_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI39_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI39_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI39_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -1408,12 +1408,12 @@ define <vscale x 7 x double> @vp_roundeven_nxv7f64(<vscale x 7 x double> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI40_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI40_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -1429,12 +1429,12 @@ define <vscale x 7 x double> @vp_roundeven_nxv7f64(<vscale x 7 x double> %va, <v
 define <vscale x 7 x double> @vp_roundeven_nxv7f64_unmasked(<vscale x 7 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv7f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI41_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI41_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI41_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI41_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -1452,12 +1452,12 @@ define <vscale x 8 x double> @vp_roundeven_nxv8f64(<vscale x 8 x double> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI42_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI42_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -1473,12 +1473,12 @@ define <vscale x 8 x double> @vp_roundeven_nxv8f64(<vscale x 8 x double> %va, <v
 define <vscale x 8 x double> @vp_roundeven_nxv8f64_unmasked(<vscale x 8 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv8f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI43_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI43_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI43_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI43_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -1498,59 +1498,66 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
 ; CHECK-NEXT:    srli a3, a1, 3
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    vslidedown.vx v6, v0, a3
+; CHECK-NEXT:    vslidedown.vx v25, v0, a3
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    fsrmi a3, 0
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a2, 0
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmflt.vf v25, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    fsrm a3
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -1570,12 +1577,12 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    fsrmi a3, 0
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a2, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a3
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
@@ -1585,8 +1592,8 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
 ; CHECK-NEXT:  .LBB45_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
index 1300d8cd64ebb..63c35d088d463 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
@@ -22,12 +22,12 @@ define <vscale x 1 x bfloat> @vp_roundtozero_nxv1bf16(<vscale x 1 x bfloat> %va,
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v11, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -49,11 +49,11 @@ define <vscale x 1 x bfloat> @vp_roundtozero_nxv1bf16_unmasked(<vscale x 1 x bfl
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -76,12 +76,12 @@ define <vscale x 2 x bfloat> @vp_roundtozero_nxv2bf16(<vscale x 2 x bfloat> %va,
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v11, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv.v.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -103,11 +103,11 @@ define <vscale x 2 x bfloat> @vp_roundtozero_nxv2bf16_unmasked(<vscale x 2 x bfl
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -130,12 +130,12 @@ define <vscale x 4 x bfloat> @vp_roundtozero_nxv4bf16(<vscale x 4 x bfloat> %va,
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v10, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v10, v0.t
@@ -157,11 +157,11 @@ define <vscale x 4 x bfloat> @vp_roundtozero_nxv4bf16_unmasked(<vscale x 4 x bfl
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -184,12 +184,12 @@ define <vscale x 8 x bfloat> @vp_roundtozero_nxv8bf16(<vscale x 8 x bfloat> %va,
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v12, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v12, v0.t
@@ -211,11 +211,11 @@ define <vscale x 8 x bfloat> @vp_roundtozero_nxv8bf16_unmasked(<vscale x 8 x bfl
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -238,12 +238,12 @@ define <vscale x 16 x bfloat> @vp_roundtozero_nxv16bf16(<vscale x 16 x bfloat> %
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
@@ -265,11 +265,11 @@ define <vscale x 16 x bfloat> @vp_roundtozero_nxv16bf16_unmasked(<vscale x 16 x
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -297,6 +297,7 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16(<vscale x 32 x bfloat> %
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    lui a3, 307200
+; CHECK-NEXT:    fsrmi a4, 1
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
@@ -315,11 +316,10 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16(<vscale x 32 x bfloat> %
 ; CHECK-NEXT:    vfabs.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v18, v8, fa5, v0.t
-; CHECK-NEXT:    fsrmi a2, 1
 ; CHECK-NEXT:    vmv1r.v v0, v18
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a4
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -336,11 +336,11 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16(<vscale x 32 x bfloat> %
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v7
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
@@ -375,11 +375,12 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16_unmasked(<vscale x 32 x
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmset.m v16
 ; CHECK-NEXT:    lui a3, 307200
+; CHECK-NEXT:    fsrmi a4, 1
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v16, v16, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    vmv1r.v v17, v16
@@ -394,11 +395,10 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16_unmasked(<vscale x 32 x
 ; CHECK-NEXT:    vfabs.v v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; CHECK-NEXT:    fsrmi a2, 1
 ; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a4
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -413,10 +413,10 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16_unmasked(<vscale x 32 x
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -439,13 +439,13 @@ declare <vscale x 1 x half> @llvm.vp.roundtozero.nxv1f16(<vscale x 1 x half>, <v
 define <vscale x 1 x half> @vp_roundtozero_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_nxv1f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI12_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI12_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI12_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI12_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -461,12 +461,12 @@ define <vscale x 1 x half> @vp_roundtozero_nxv1f16(<vscale x 1 x half> %va, <vsc
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -485,12 +485,12 @@ define <vscale x 1 x half> @vp_roundtozero_nxv1f16(<vscale x 1 x half> %va, <vsc
 define <vscale x 1 x half> @vp_roundtozero_nxv1f16_unmasked(<vscale x 1 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_nxv1f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI13_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI13_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI13_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI13_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -503,11 +503,11 @@ define <vscale x 1 x half> @vp_roundtozero_nxv1f16_unmasked(<vscale x 1 x half>
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -525,13 +525,13 @@ declare <vscale x 2 x half> @llvm.vp.roundtozero.nxv2f16(<vscale x 2 x half>, <v
 define <vscale x 2 x half> @vp_roundtozero_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_nxv2f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI14_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI14_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI14_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI14_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -547,12 +547,12 @@ define <vscale x 2 x half> @vp_roundtozero_nxv2f16(<vscale x 2 x half> %va, <vsc
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v11, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v11, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v11, v10, v0.t
@@ -571,12 +571,12 @@ define <vscale x 2 x half> @vp_roundtozero_nxv2f16(<vscale x 2 x half> %va, <vsc
 define <vscale x 2 x half> @vp_roundtozero_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_nxv2f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI15_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI15_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI15_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI15_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -589,11 +589,11 @@ define <vscale x 2 x half> @vp_roundtozero_nxv2f16_unmasked(<vscale x 2 x half>
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -611,13 +611,13 @@ declare <vscale x 4 x half> @llvm.vp.roundtozero.nxv4f16(<vscale x 4 x half>, <v
 define <vscale x 4 x half> @vp_roundtozero_nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_nxv4f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI16_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI16_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8, v0.t
+; ZVFH-NEXT:    lui a0, %hi(.LCPI16_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI16_0)(a0)
+; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
@@ -633,12 +633,12 @@ define <vscale x 4 x half> @vp_roundtozero_nxv4f16(<vscale x 4 x half> %va, <vsc
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v12, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v12, v10, v0.t
@@ -657,12 +657,12 @@ define <vscale x 4 x half> @vp_roundtozero_nxv4f16(<vscale x 4 x half> %va, <vsc
 define <vscale x 4 x half> @vp_roundtozero_nxv4f16_unmasked(<vscale x 4 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_nxv4f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI17_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI17_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
-; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI17_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI17_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -675,11 +675,11 @@ define <vscale x 4 x half> @vp_roundtozero_nxv4f16_unmasked(<vscale x 4 x half>
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -699,12 +699,12 @@ define <vscale x 8 x half> @vp_roundtozero_nxv8f16(<vscale x 8 x half> %va, <vsc
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v10, v0
+; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI18_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI18_0)(a0)
-; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vmv1r.v v0, v10
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -721,12 +721,12 @@ define <vscale x 8 x half> @vp_roundtozero_nxv8f16(<vscale x 8 x half> %va, <vsc
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v12, v0.t
@@ -745,12 +745,12 @@ define <vscale x 8 x half> @vp_roundtozero_nxv8f16(<vscale x 8 x half> %va, <vsc
 define <vscale x 8 x half> @vp_roundtozero_nxv8f16_unmasked(<vscale x 8 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_nxv8f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI19_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI19_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v10, v8
-; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI19_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI19_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vmflt.vf v0, v10, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -763,11 +763,11 @@ define <vscale x 8 x half> @vp_roundtozero_nxv8f16_unmasked(<vscale x 8 x half>
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -787,12 +787,12 @@ define <vscale x 16 x half> @vp_roundtozero_nxv16f16(<vscale x 16 x half> %va, <
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v12, v0
+; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI20_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI20_0)(a0)
-; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vmv1r.v v0, v12
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -809,12 +809,12 @@ define <vscale x 16 x half> @vp_roundtozero_nxv16f16(<vscale x 16 x half> %va, <
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v24, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
@@ -833,12 +833,12 @@ define <vscale x 16 x half> @vp_roundtozero_nxv16f16(<vscale x 16 x half> %va, <
 define <vscale x 16 x half> @vp_roundtozero_nxv16f16_unmasked(<vscale x 16 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_nxv16f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI21_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI21_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8
-; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI21_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI21_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vmflt.vf v0, v12, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -851,11 +851,11 @@ define <vscale x 16 x half> @vp_roundtozero_nxv16f16_unmasked(<vscale x 16 x hal
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    lui a0, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -875,12 +875,12 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmv1r.v v16, v0
+; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; ZVFH-NEXT:    lui a0, %hi(.LCPI22_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI22_0)(a0)
-; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
+; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
 ; ZVFH-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; ZVFH-NEXT:    fsrmi a0, 1
 ; ZVFH-NEXT:    vmv1r.v v0, v16
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -902,6 +902,7 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    lui a3, 307200
+; ZVFHMIN-NEXT:    fsrmi a4, 1
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
@@ -920,11 +921,10 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
 ; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v18, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a2, 1
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v18
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    fsrm a2
+; ZVFHMIN-NEXT:    fsrm a4
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -941,11 +941,11 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v8, v7
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
@@ -970,12 +970,12 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
 define <vscale x 32 x half> @vp_roundtozero_nxv32f16_unmasked(<vscale x 32 x half> %va, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_nxv32f16_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    lui a1, %hi(.LCPI23_0)
-; ZVFH-NEXT:    flh fa5, %lo(.LCPI23_0)(a1)
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8
-; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
+; ZVFH-NEXT:    lui a0, %hi(.LCPI23_0)
+; ZVFH-NEXT:    flh fa5, %lo(.LCPI23_0)(a0)
 ; ZVFH-NEXT:    fsrmi a0, 1
+; ZVFH-NEXT:    vmflt.vf v0, v16, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; ZVFH-NEXT:    fsrm a0
 ; ZVFH-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -995,11 +995,12 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16_unmasked(<vscale x 32 x hal
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    lui a3, 307200
+; ZVFHMIN-NEXT:    fsrmi a4, 1
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v17, v16
@@ -1014,11 +1015,10 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16_unmasked(<vscale x 32 x hal
 ; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    fsrmi a2, 1
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v17
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    fsrm a2
+; ZVFHMIN-NEXT:    fsrm a4
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
@@ -1033,10 +1033,10 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16_unmasked(<vscale x 32 x hal
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
-; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
@@ -1064,9 +1064,9 @@ define <vscale x 1 x float> @vp_roundtozero_nxv1f32(<vscale x 1 x float> %va, <v
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1085,8 +1085,8 @@ define <vscale x 1 x float> @vp_roundtozero_nxv1f32_unmasked(<vscale x 1 x float
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -1106,9 +1106,9 @@ define <vscale x 2 x float> @vp_roundtozero_nxv2f32(<vscale x 2 x float> %va, <v
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1127,8 +1127,8 @@ define <vscale x 2 x float> @vp_roundtozero_nxv2f32_unmasked(<vscale x 2 x float
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -1149,9 +1149,9 @@ define <vscale x 4 x float> @vp_roundtozero_nxv4f32(<vscale x 4 x float> %va, <v
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -1171,8 +1171,8 @@ define <vscale x 4 x float> @vp_roundtozero_nxv4f32_unmasked(<vscale x 4 x float
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -1193,9 +1193,9 @@ define <vscale x 8 x float> @vp_roundtozero_nxv8f32(<vscale x 8 x float> %va, <v
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -1215,8 +1215,8 @@ define <vscale x 8 x float> @vp_roundtozero_nxv8f32_unmasked(<vscale x 8 x float
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -1237,9 +1237,9 @@ define <vscale x 16 x float> @vp_roundtozero_nxv16f32(<vscale x 16 x float> %va,
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -1259,8 +1259,8 @@ define <vscale x 16 x float> @vp_roundtozero_nxv16f32_unmasked(<vscale x 16 x fl
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -1276,13 +1276,13 @@ declare <vscale x 1 x double> @llvm.vp.roundtozero.nxv1f64(<vscale x 1 x double>
 define <vscale x 1 x double> @vp_roundtozero_nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI34_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI34_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8, v0.t
+; CHECK-NEXT:    lui a0, %hi(.LCPI34_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI34_0)(a0)
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
@@ -1297,12 +1297,12 @@ define <vscale x 1 x double> @vp_roundtozero_nxv1f64(<vscale x 1 x double> %va,
 define <vscale x 1 x double> @vp_roundtozero_nxv1f64_unmasked(<vscale x 1 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv1f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI35_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI35_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v9, v8
-; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI35_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI35_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -1320,12 +1320,12 @@ define <vscale x 2 x double> @vp_roundtozero_nxv2f64(<vscale x 2 x double> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI36_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI36_0)(a0)
-; CHECK-NEXT:    vfabs.v v12, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v12, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
@@ -1341,12 +1341,12 @@ define <vscale x 2 x double> @vp_roundtozero_nxv2f64(<vscale x 2 x double> %va,
 define <vscale x 2 x double> @vp_roundtozero_nxv2f64_unmasked(<vscale x 2 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv2f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI37_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI37_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v10, v8
-; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI37_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI37_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -1364,12 +1364,12 @@ define <vscale x 4 x double> @vp_roundtozero_nxv4f64(<vscale x 4 x double> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v12, v0
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI38_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI38_0)(a0)
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
@@ -1385,12 +1385,12 @@ define <vscale x 4 x double> @vp_roundtozero_nxv4f64(<vscale x 4 x double> %va,
 define <vscale x 4 x double> @vp_roundtozero_nxv4f64_unmasked(<vscale x 4 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv4f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI39_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI39_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8
-; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI39_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI39_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -1408,12 +1408,12 @@ define <vscale x 7 x double> @vp_roundtozero_nxv7f64(<vscale x 7 x double> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI40_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI40_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -1429,12 +1429,12 @@ define <vscale x 7 x double> @vp_roundtozero_nxv7f64(<vscale x 7 x double> %va,
 define <vscale x 7 x double> @vp_roundtozero_nxv7f64_unmasked(<vscale x 7 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv7f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI41_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI41_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI41_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI41_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -1452,12 +1452,12 @@ define <vscale x 8 x double> @vp_roundtozero_nxv8f64(<vscale x 8 x double> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a0, %hi(.LCPI42_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI42_0)(a0)
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v16, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -1473,12 +1473,12 @@ define <vscale x 8 x double> @vp_roundtozero_nxv8f64(<vscale x 8 x double> %va,
 define <vscale x 8 x double> @vp_roundtozero_nxv8f64_unmasked(<vscale x 8 x double> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv8f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI43_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI43_0)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8
-; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    lui a0, %hi(.LCPI43_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI43_0)(a0)
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -1498,59 +1498,66 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
 ; CHECK-NEXT:    srli a3, a1, 3
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    vslidedown.vx v6, v0, a3
+; CHECK-NEXT:    vslidedown.vx v25, v0, a3
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    fsrmi a3, 1
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; CHECK-NEXT:    fsrmi a2, 1
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmflt.vf v25, v8, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    fsrm a3
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT:    vmflt.vf v24, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -1570,12 +1577,12 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    fsrmi a3, 1
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a2, 1
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    fsrm a2
+; CHECK-NEXT:    fsrm a3
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
@@ -1585,8 +1592,8 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
 ; CHECK-NEXT:  .LBB45_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll
index 53d1666c30e96..6c2ea46302b06 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll
@@ -4,13 +4,13 @@
 define <vscale x 16 x i32> @bar(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, <vscale x 16 x i32> %w, <vscale x 16 x i32> %x, <vscale x 16 x i32> %y, <vscale x 16 x i32> %z) {
 ; CHECK-LABEL: bar:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    ld a0, 0(sp)
-; CHECK-NEXT:    ld a1, 8(sp)
+; CHECK-NEXT:    ld a0, 8(sp)
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vl8re32.v v0, (a1)
+; CHECK-NEXT:    ld a0, 0(sp)
+; CHECK-NEXT:    vl8re32.v v0, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vadd.vv v8, v8, v24
-; CHECK-NEXT:    vadd.vv v16, v16, v0
+; CHECK-NEXT:    vadd.vv v8, v8, v0
+; CHECK-NEXT:    vadd.vv v16, v16, v24
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %s0 = add <vscale x 16 x i32> %w, %y
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
index d329979857a6b..d5b8362403d1d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
@@ -968,8 +968,8 @@ define <vscale x 2 x float> @vfredusum(<vscale x 2 x float> %passthru, <vscale x
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
 ; CHECK-NEXT:    vmv1r.v v11, v8
 ; CHECK-NEXT:    vfredusum.vs v11, v9, v10
-; CHECK-NEXT:    vmerge.vvm v8, v8, v11, v0
 ; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    vmerge.vvm v8, v8, v11, v0
 ; CHECK-NEXT:    ret
   %a = call <vscale x 2 x float> @llvm.riscv.vfredusum.nxv2f32.nxv2f32(
     <vscale x 2 x float> %passthru,
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index e6272701a6033..9a4121b41c3f3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -1358,13 +1358,10 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    slli a1, a1, 2
-; CHECK-NEXT:    add a1, a1, a3
+; CHECK-NEXT:    slli a3, a1, 5
+; CHECK-NEXT:    add a1, a3, a1
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
@@ -1392,79 +1389,78 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    and a7, a7, a1
 ; CHECK-NEXT:    srli a1, a3, 1
 ; CHECK-NEXT:    srli a3, a3, 2
-; CHECK-NEXT:    csrr t0, vlenb
-; CHECK-NEXT:    slli t0, t0, 3
-; CHECK-NEXT:    add t0, sp, t0
-; CHECK-NEXT:    addi t0, t0, 16
+; CHECK-NEXT:    addi t0, sp, 16
 ; CHECK-NEXT:    vs1r.v v24, (t0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vslidedown.vx v25, v24, a1
 ; CHECK-NEXT:    vsetvli t0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v24, v25, a3
-; CHECK-NEXT:    vl8re16.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli t0, a0, 5
-; CHECK-NEXT:    add a0, t0, a0
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli t0, a0, 3
-; CHECK-NEXT:    add a0, t0, a0
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv4r.v v8, v0
+; CHECK-NEXT:    csrr t0, vlenb
+; CHECK-NEXT:    add t0, sp, t0
+; CHECK-NEXT:    addi t0, t0, 16
+; CHECK-NEXT:    vs8r.v v8, (t0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli zero, a7, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v4
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a7, vlenb
+; CHECK-NEXT:    slli t0, a7, 4
+; CHECK-NEXT:    add a7, t0, a7
+; CHECK-NEXT:    add a7, sp, a7
+; CHECK-NEXT:    addi a7, a7, 16
+; CHECK-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    csrr a7, vlenb
+; CHECK-NEXT:    slli t0, a7, 3
+; CHECK-NEXT:    add a7, t0, a7
+; CHECK-NEXT:    add a7, sp, a7
+; CHECK-NEXT:    addi a7, a7, 16
+; CHECK-NEXT:    vs8r.v v16, (a7) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    csrr a7, vlenb
+; CHECK-NEXT:    slli t0, a7, 4
+; CHECK-NEXT:    add a7, t0, a7
+; CHECK-NEXT:    add a7, sp, a7
+; CHECK-NEXT:    addi a7, a7, 16
+; CHECK-NEXT:    vl8r.v v8, (a7) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v4, v16, v8, v0.t
+; CHECK-NEXT:    vl8re16.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a7, a0, 4
 ; CHECK-NEXT:    add a0, a7, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v6, v16, v8, v0.t
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a6, a4, .LBB85_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a6, a4
 ; CHECK-NEXT:  .LBB85_2:
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a7, a0, 3
-; CHECK-NEXT:    add a0, a7, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a6, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a6, a0, 4
+; CHECK-NEXT:    slli a6, a0, 3
 ; CHECK-NEXT:    add a0, a6, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v5, v24, v8, v0.t
+; CHECK-NEXT:    vmfeq.vv v7, v8, v16, v0.t
 ; CHECK-NEXT:    add a0, a3, a3
 ; CHECK-NEXT:    bltu a2, a5, .LBB85_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a2, a5
 ; CHECK-NEXT:  .LBB85_4:
 ; CHECK-NEXT:    sub a5, a2, a4
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 3
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vl1r.v v7, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a6, sp, 16
+; CHECK-NEXT:    vl1r.v v5, (a6) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a6, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v7, a3
+; CHECK-NEXT:    vslidedown.vx v0, v5, a3
 ; CHECK-NEXT:    sltu a6, a2, a5
 ; CHECK-NEXT:    addi a6, a6, -1
 ; CHECK-NEXT:    and a5, a6, a5
@@ -1476,60 +1472,54 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    add a6, a6, a7
 ; CHECK-NEXT:    add a6, sp, a6
 ; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a6) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a5, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
 ; CHECK-NEXT:    csrr a5, vlenb
 ; CHECK-NEXT:    slli a6, a5, 4
 ; CHECK-NEXT:    add a5, a6, a5
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a6, a5, 5
-; CHECK-NEXT:    add a5, a6, a5
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a6, a5, 4
-; CHECK-NEXT:    add a5, a6, a5
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v4, v24, v8, v0.t
+; CHECK-NEXT:    vmfeq.vv v6, v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v5, v6, a3
+; CHECK-NEXT:    vslideup.vx v7, v4, a3
 ; CHECK-NEXT:    bltu a2, a4, .LBB85_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a2, a4
 ; CHECK-NEXT:  .LBB85_6:
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    csrr a4, vlenb
+; CHECK-NEXT:    mv a5, a4
+; CHECK-NEXT:    slli a4, a4, 3
+; CHECK-NEXT:    add a5, a5, a4
+; CHECK-NEXT:    slli a4, a4, 1
+; CHECK-NEXT:    add a4, a4, a5
+; CHECK-NEXT:    add a4, sp, a4
+; CHECK-NEXT:    addi a4, a4, 16
+; CHECK-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a4, a2, 5
+; CHECK-NEXT:    slli a4, a2, 4
 ; CHECK-NEXT:    add a2, a4, a2
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v24, v16, v0.t
+; CHECK-NEXT:    vmfeq.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v4, a3
+; CHECK-NEXT:    vslideup.vx v8, v6, a3
 ; CHECK-NEXT:    add a0, a1, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v5, a1
+; CHECK-NEXT:    vslideup.vx v8, v7, a1
 ; CHECK-NEXT:    vmv.v.v v0, v8
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    slli a1, a0, 5
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -3498,59 +3488,33 @@ declare <vscale x 64 x i1> @llvm.vp.fcmp.nxv64f16(<vscale x 64 x half>, <vscale
 define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscale x 64 x half> %vb, <vscale x 64 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: fcmp_oeq_vv_nxv64f16:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    addi sp, sp, -16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 16
-; ZVFH-NEXT:    csrr a1, vlenb
-; ZVFH-NEXT:    slli a1, a1, 4
-; ZVFH-NEXT:    sub sp, sp, a1
-; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; ZVFH-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; ZVFH-NEXT:    vmv1r.v v24, v0
+; ZVFH-NEXT:    vmv1r.v v7, v0
 ; ZVFH-NEXT:    csrr a1, vlenb
-; ZVFH-NEXT:    slli a1, a1, 3
-; ZVFH-NEXT:    add a1, sp, a1
-; ZVFH-NEXT:    addi a1, a1, 16
-; ZVFH-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFH-NEXT:    csrr a3, vlenb
-; ZVFH-NEXT:    srli a1, a3, 1
-; ZVFH-NEXT:    slli a4, a3, 3
-; ZVFH-NEXT:    slli a3, a3, 2
+; ZVFH-NEXT:    slli a4, a1, 3
+; ZVFH-NEXT:    slli a3, a1, 2
 ; ZVFH-NEXT:    add a4, a0, a4
 ; ZVFH-NEXT:    sub a5, a2, a3
-; ZVFH-NEXT:    vl8re16.v v8, (a4)
+; ZVFH-NEXT:    vl8re16.v v24, (a4)
 ; ZVFH-NEXT:    sltu a4, a2, a5
 ; ZVFH-NEXT:    addi a4, a4, -1
-; ZVFH-NEXT:    vl8re16.v v0, (a0)
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT:    vslidedown.vx v0, v24, a1
 ; ZVFH-NEXT:    and a4, a4, a5
+; ZVFH-NEXT:    srli a1, a1, 1
+; ZVFH-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFH-NEXT:    vsetvli zero, a4, e16, m8, ta, ma
-; ZVFH-NEXT:    vmfeq.vv v7, v16, v8, v0.t
+; ZVFH-NEXT:    vmfeq.vv v6, v16, v24, v0.t
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
 ; ZVFH-NEXT:    bltu a2, a3, .LBB171_2
 ; ZVFH-NEXT:  # %bb.1:
 ; ZVFH-NEXT:    mv a2, a3
 ; ZVFH-NEXT:  .LBB171_2:
-; ZVFH-NEXT:    vmv1r.v v0, v24
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
-; ZVFH-NEXT:    add a0, sp, a0
-; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vmv1r.v v0, v7
 ; ZVFH-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v16, v8, v24, v0.t
 ; ZVFH-NEXT:    add a0, a1, a1
 ; ZVFH-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; ZVFH-NEXT:    vslideup.vx v16, v7, a1
+; ZVFH-NEXT:    vslideup.vx v16, v6, a1
 ; ZVFH-NEXT:    vmv.v.v v0, v16
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 4
-; ZVFH-NEXT:    add sp, sp, a0
-; ZVFH-NEXT:    .cfi_def_cfa sp, 16
-; ZVFH-NEXT:    addi sp, sp, 16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: fcmp_oeq_vv_nxv64f16:
@@ -3558,13 +3522,10 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    mv a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a3, a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    add a1, a1, a3
+; ZVFHMIN-NEXT:    slli a3, a1, 5
+; ZVFHMIN-NEXT:    add a1, a3, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v24, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
@@ -3592,79 +3553,78 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    and a7, a7, a1
 ; ZVFHMIN-NEXT:    srli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    csrr t0, vlenb
-; ZVFHMIN-NEXT:    slli t0, t0, 3
-; ZVFHMIN-NEXT:    add t0, sp, t0
-; ZVFHMIN-NEXT:    addi t0, t0, 16
+; ZVFHMIN-NEXT:    addi t0, sp, 16
 ; ZVFHMIN-NEXT:    vs1r.v v24, (t0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vslidedown.vx v25, v24, a1
 ; ZVFHMIN-NEXT:    vsetvli t0, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v24, v25, a3
-; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli t0, a0, 5
-; ZVFHMIN-NEXT:    add a0, t0, a0
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli t0, a0, 3
-; ZVFHMIN-NEXT:    add a0, t0, a0
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v8, v0
+; ZVFHMIN-NEXT:    csrr t0, vlenb
+; ZVFHMIN-NEXT:    add t0, sp, t0
+; ZVFHMIN-NEXT:    addi t0, t0, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (t0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a7, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a7, vlenb
+; ZVFHMIN-NEXT:    slli t0, a7, 4
+; ZVFHMIN-NEXT:    add a7, t0, a7
+; ZVFHMIN-NEXT:    add a7, sp, a7
+; ZVFHMIN-NEXT:    addi a7, a7, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv8r.v v8, v16
+; ZVFHMIN-NEXT:    csrr a7, vlenb
+; ZVFHMIN-NEXT:    slli t0, a7, 3
+; ZVFHMIN-NEXT:    add a7, t0, a7
+; ZVFHMIN-NEXT:    add a7, sp, a7
+; ZVFHMIN-NEXT:    addi a7, a7, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a7) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vmv1r.v v0, v24
+; ZVFHMIN-NEXT:    csrr a7, vlenb
+; ZVFHMIN-NEXT:    slli t0, a7, 4
+; ZVFHMIN-NEXT:    add a7, t0, a7
+; ZVFHMIN-NEXT:    add a7, sp, a7
+; ZVFHMIN-NEXT:    addi a7, a7, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a7) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v4, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a7, a0, 4
 ; ZVFHMIN-NEXT:    add a0, a7, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v24
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v6, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a6, a4, .LBB171_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a6, a4
 ; ZVFHMIN-NEXT:  .LBB171_2:
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v25
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a7, a0, 3
-; ZVFHMIN-NEXT:    add a0, a7, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a6, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a6, a0, 4
+; ZVFHMIN-NEXT:    slli a6, a0, 3
 ; ZVFHMIN-NEXT:    add a0, a6, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v5, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    add a0, a3, a3
 ; ZVFHMIN-NEXT:    bltu a2, a5, .LBB171_4
 ; ZVFHMIN-NEXT:  # %bb.3:
 ; ZVFHMIN-NEXT:    mv a2, a5
 ; ZVFHMIN-NEXT:  .LBB171_4:
 ; ZVFHMIN-NEXT:    sub a5, a2, a4
-; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a6, a6, 3
-; ZVFHMIN-NEXT:    add a6, sp, a6
-; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl1r.v v7, (a6) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a6, sp, 16
+; ZVFHMIN-NEXT:    vl1r.v v5, (a6) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a6, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v5, a3
 ; ZVFHMIN-NEXT:    sltu a6, a2, a5
 ; ZVFHMIN-NEXT:    addi a6, a6, -1
 ; ZVFHMIN-NEXT:    and a5, a6, a5
@@ -3676,60 +3636,54 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a6, a6, a7
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a6) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a5, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a5, vlenb
 ; ZVFHMIN-NEXT:    slli a6, a5, 4
 ; ZVFHMIN-NEXT:    add a5, a6, a5
 ; ZVFHMIN-NEXT:    add a5, sp, a5
 ; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a5, vlenb
-; ZVFHMIN-NEXT:    slli a6, a5, 5
-; ZVFHMIN-NEXT:    add a5, a6, a5
-; ZVFHMIN-NEXT:    add a5, sp, a5
-; ZVFHMIN-NEXT:    addi a5, a5, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT:    csrr a5, vlenb
-; ZVFHMIN-NEXT:    slli a6, a5, 4
-; ZVFHMIN-NEXT:    add a5, a6, a5
-; ZVFHMIN-NEXT:    add a5, sp, a5
-; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v4, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v6, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v5, v6, a3
+; ZVFHMIN-NEXT:    vslideup.vx v7, v4, a3
 ; ZVFHMIN-NEXT:    bltu a2, a4, .LBB171_6
 ; ZVFHMIN-NEXT:  # %bb.5:
 ; ZVFHMIN-NEXT:    mv a2, a4
 ; ZVFHMIN-NEXT:  .LBB171_6:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
+; ZVFHMIN-NEXT:    vmv1r.v v0, v5
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    add a5, a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 5
+; ZVFHMIN-NEXT:    slli a4, a2, 4
 ; ZVFHMIN-NEXT:    add a2, a4, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v8, v4, a3
+; ZVFHMIN-NEXT:    vslideup.vx v8, v6, a3
 ; ZVFHMIN-NEXT:    add a0, a1, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v8, v5, a1
+; ZVFHMIN-NEXT:    vslideup.vx v8, v7, a1
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a1, a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 5
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
index 9c733b17dc6e9..7c96fce145d30 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
@@ -1077,55 +1077,29 @@ declare <vscale x 128 x i1> @llvm.vp.icmp.nxv128i8(<vscale x 128 x i8>, <vscale
 define <vscale x 128 x i1> @icmp_eq_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: icmp_eq_vv_nxv128i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    sub a4, a3, a1
-; CHECK-NEXT:    vl8r.v v8, (a2)
+; CHECK-NEXT:    vl8r.v v24, (a2)
 ; CHECK-NEXT:    sltu a2, a3, a4
-; CHECK-NEXT:    vl8r.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a4
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v6, v16, v8, v0.t
+; CHECK-NEXT:    vmseq.vv v6, v16, v24, v0.t
+; CHECK-NEXT:    vl8r.v v24, (a0)
 ; CHECK-NEXT:    bltu a3, a1, .LBB96_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a3, a1
 ; CHECK-NEXT:  .LBB96_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vv v16, v8, v24, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmv1r.v v8, v6
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, metadata !"eq", <vscale x 128 x i1> %m, i32 %evl)
   ret <vscale x 128 x i1> %v
@@ -2223,59 +2197,33 @@ declare <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32>, <vscale x
 define <vscale x 32 x i1> @icmp_eq_vv_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: icmp_eq_vv_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    srli a1, a3, 2
-; CHECK-NEXT:    slli a4, a3, 3
-; CHECK-NEXT:    slli a3, a3, 1
+; CHECK-NEXT:    slli a4, a1, 3
+; CHECK-NEXT:    slli a3, a1, 1
 ; CHECK-NEXT:    add a4, a0, a4
 ; CHECK-NEXT:    sub a5, a2, a3
-; CHECK-NEXT:    vl8re32.v v8, (a4)
+; CHECK-NEXT:    vl8re32.v v24, (a4)
 ; CHECK-NEXT:    sltu a4, a2, a5
 ; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    vl8re32.v v0, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vslidedown.vx v0, v24, a1
 ; CHECK-NEXT:    and a4, a4, a5
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    vslidedown.vx v0, v0, a1
 ; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v7, v16, v8, v0.t
+; CHECK-NEXT:    vmseq.vv v6, v16, v24, v0.t
+; CHECK-NEXT:    vl8re32.v v24, (a0)
 ; CHECK-NEXT:    bltu a2, a3, .LBB189_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a3
 ; CHECK-NEXT:  .LBB189_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vv v16, v8, v24, v0.t
 ; CHECK-NEXT:    add a0, a1, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v16, v7, a1
+; CHECK-NEXT:    vslideup.vx v16, v6, a1
 ; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, metadata !"eq", <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i1> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/shrinkwrap.ll b/llvm/test/CodeGen/RISCV/rvv/shrinkwrap.ll
index d12f2c889650f..eb6635117d0a1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/shrinkwrap.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/shrinkwrap.ll
@@ -17,8 +17,8 @@ define void @vecaddr_straightline(i32 zeroext %a, ptr %p) {
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vle32.v v8, (s0)
 ; RV32-NEXT:    vadd.vi v8, v8, 1
-; RV32-NEXT:    li a1, 57
 ; RV32-NEXT:    vse32.v v8, (s0)
+; RV32-NEXT:    li a1, 57
 ; RV32-NEXT:    beq a0, a1, .LBB0_2
 ; RV32-NEXT:  # %bb.1: # %do_call
 ; RV32-NEXT:    call foo
@@ -47,8 +47,8 @@ define void @vecaddr_straightline(i32 zeroext %a, ptr %p) {
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vle32.v v8, (s0)
 ; RV64-NEXT:    vadd.vi v8, v8, 1
-; RV64-NEXT:    li a1, 57
 ; RV64-NEXT:    vse32.v v8, (s0)
+; RV64-NEXT:    li a1, 57
 ; RV64-NEXT:    beq a0, a1, .LBB0_2
 ; RV64-NEXT:  # %bb.1: # %do_call
 ; RV64-NEXT:    call foo
@@ -97,8 +97,8 @@ define void @vecaddr_loop(i32 zeroext %a, ptr %p) {
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vle32.v v8, (s0)
 ; RV32-NEXT:    vadd.vi v8, v8, 1
-; RV32-NEXT:    li a1, 57
 ; RV32-NEXT:    vse32.v v8, (s0)
+; RV32-NEXT:    li a1, 57
 ; RV32-NEXT:    beq a0, a1, .LBB1_2
 ; RV32-NEXT:  .LBB1_1: # %do_call
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -129,8 +129,8 @@ define void @vecaddr_loop(i32 zeroext %a, ptr %p) {
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vle32.v v8, (s0)
 ; RV64-NEXT:    vadd.vi v8, v8, 1
-; RV64-NEXT:    li a1, 57
 ; RV64-NEXT:    vse32.v v8, (s0)
+; RV64-NEXT:    li a1, 57
 ; RV64-NEXT:    beq a0, a1, .LBB1_2
 ; RV64-NEXT:  .LBB1_1: # %do_call
 ; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
index 1948675ae9cf0..c0792566160ba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
@@ -1457,19 +1457,19 @@ for.cond.cleanup:                                 ; preds = %vector.body
 define void @sink_splat_fmul_scalable(ptr nocapture %a, float %x) {
 ; CHECK-LABEL: sink_splat_fmul_scalable:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a3, a1, 2
-; CHECK-NEXT:    li a2, 1024
-; CHECK-NEXT:    bgeu a2, a3, .LBB26_2
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    srli a3, a2, 2
+; CHECK-NEXT:    li a1, 1024
+; CHECK-NEXT:    bgeu a1, a3, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a2, 0
+; CHECK-NEXT:    li a1, 0
 ; CHECK-NEXT:    j .LBB26_5
 ; CHECK-NEXT:  .LBB26_2: # %vector.ph
-; CHECK-NEXT:    addi a2, a3, -1
-; CHECK-NEXT:    andi a4, a2, 1024
-; CHECK-NEXT:    xori a2, a4, 1024
+; CHECK-NEXT:    addi a1, a3, -1
+; CHECK-NEXT:    andi a4, a1, 1024
+; CHECK-NEXT:    xori a1, a4, 1024
 ; CHECK-NEXT:    mv a5, a0
-; CHECK-NEXT:    mv a6, a2
+; CHECK-NEXT:    mv a6, a1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB26_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1477,12 +1477,12 @@ define void @sink_splat_fmul_scalable(ptr nocapture %a, float %x) {
 ; CHECK-NEXT:    sub a6, a6, a3
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
 ; CHECK-NEXT:    vs1r.v v8, (a5)
-; CHECK-NEXT:    add a5, a5, a1
+; CHECK-NEXT:    add a5, a5, a2
 ; CHECK-NEXT:    bnez a6, .LBB26_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB26_7
 ; CHECK-NEXT:  .LBB26_5: # %for.body.preheader
-; CHECK-NEXT:    slli a1, a2, 2
+; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    lui a2, 1
 ; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    add a0, a0, a2
@@ -1547,19 +1547,19 @@ for.body:                                         ; preds = %for.body.preheader,
 define void @sink_splat_fdiv_scalable(ptr nocapture %a, float %x) {
 ; CHECK-LABEL: sink_splat_fdiv_scalable:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a3, a1, 2
-; CHECK-NEXT:    li a2, 1024
-; CHECK-NEXT:    bgeu a2, a3, .LBB27_2
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    srli a3, a2, 2
+; CHECK-NEXT:    li a1, 1024
+; CHECK-NEXT:    bgeu a1, a3, .LBB27_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a2, 0
+; CHECK-NEXT:    li a1, 0
 ; CHECK-NEXT:    j .LBB27_5
 ; CHECK-NEXT:  .LBB27_2: # %vector.ph
-; CHECK-NEXT:    addi a2, a3, -1
-; CHECK-NEXT:    andi a4, a2, 1024
-; CHECK-NEXT:    xori a2, a4, 1024
+; CHECK-NEXT:    addi a1, a3, -1
+; CHECK-NEXT:    andi a4, a1, 1024
+; CHECK-NEXT:    xori a1, a4, 1024
 ; CHECK-NEXT:    mv a5, a0
-; CHECK-NEXT:    mv a6, a2
+; CHECK-NEXT:    mv a6, a1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB27_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1567,12 +1567,12 @@ define void @sink_splat_fdiv_scalable(ptr nocapture %a, float %x) {
 ; CHECK-NEXT:    sub a6, a6, a3
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
 ; CHECK-NEXT:    vs1r.v v8, (a5)
-; CHECK-NEXT:    add a5, a5, a1
+; CHECK-NEXT:    add a5, a5, a2
 ; CHECK-NEXT:    bnez a6, .LBB27_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB27_7
 ; CHECK-NEXT:  .LBB27_5: # %for.body.preheader
-; CHECK-NEXT:    slli a1, a2, 2
+; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    lui a2, 1
 ; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    add a0, a0, a2
@@ -1637,19 +1637,19 @@ for.body:                                         ; preds = %for.body.preheader,
 define void @sink_splat_frdiv_scalable(ptr nocapture %a, float %x) {
 ; CHECK-LABEL: sink_splat_frdiv_scalable:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a3, a1, 2
-; CHECK-NEXT:    li a2, 1024
-; CHECK-NEXT:    bgeu a2, a3, .LBB28_2
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    srli a3, a2, 2
+; CHECK-NEXT:    li a1, 1024
+; CHECK-NEXT:    bgeu a1, a3, .LBB28_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a2, 0
+; CHECK-NEXT:    li a1, 0
 ; CHECK-NEXT:    j .LBB28_5
 ; CHECK-NEXT:  .LBB28_2: # %vector.ph
-; CHECK-NEXT:    addi a2, a3, -1
-; CHECK-NEXT:    andi a4, a2, 1024
-; CHECK-NEXT:    xori a2, a4, 1024
+; CHECK-NEXT:    addi a1, a3, -1
+; CHECK-NEXT:    andi a4, a1, 1024
+; CHECK-NEXT:    xori a1, a4, 1024
 ; CHECK-NEXT:    mv a5, a0
-; CHECK-NEXT:    mv a6, a2
+; CHECK-NEXT:    mv a6, a1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB28_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1657,12 +1657,12 @@ define void @sink_splat_frdiv_scalable(ptr nocapture %a, float %x) {
 ; CHECK-NEXT:    sub a6, a6, a3
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
 ; CHECK-NEXT:    vs1r.v v8, (a5)
-; CHECK-NEXT:    add a5, a5, a1
+; CHECK-NEXT:    add a5, a5, a2
 ; CHECK-NEXT:    bnez a6, .LBB28_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB28_7
 ; CHECK-NEXT:  .LBB28_5: # %for.body.preheader
-; CHECK-NEXT:    slli a1, a2, 2
+; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    lui a2, 1
 ; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    add a0, a0, a2
@@ -1727,19 +1727,19 @@ for.body:                                         ; preds = %for.body.preheader,
 define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
 ; CHECK-LABEL: sink_splat_fadd_scalable:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a3, a1, 2
-; CHECK-NEXT:    li a2, 1024
-; CHECK-NEXT:    bgeu a2, a3, .LBB29_2
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    srli a3, a2, 2
+; CHECK-NEXT:    li a1, 1024
+; CHECK-NEXT:    bgeu a1, a3, .LBB29_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a2, 0
+; CHECK-NEXT:    li a1, 0
 ; CHECK-NEXT:    j .LBB29_5
 ; CHECK-NEXT:  .LBB29_2: # %vector.ph
-; CHECK-NEXT:    addi a2, a3, -1
-; CHECK-NEXT:    andi a4, a2, 1024
-; CHECK-NEXT:    xori a2, a4, 1024
+; CHECK-NEXT:    addi a1, a3, -1
+; CHECK-NEXT:    andi a4, a1, 1024
+; CHECK-NEXT:    xori a1, a4, 1024
 ; CHECK-NEXT:    mv a5, a0
-; CHECK-NEXT:    mv a6, a2
+; CHECK-NEXT:    mv a6, a1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB29_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1747,12 +1747,12 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
 ; CHECK-NEXT:    sub a6, a6, a3
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
 ; CHECK-NEXT:    vs1r.v v8, (a5)
-; CHECK-NEXT:    add a5, a5, a1
+; CHECK-NEXT:    add a5, a5, a2
 ; CHECK-NEXT:    bnez a6, .LBB29_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB29_7
 ; CHECK-NEXT:  .LBB29_5: # %for.body.preheader
-; CHECK-NEXT:    slli a1, a2, 2
+; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    lui a2, 1
 ; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    add a0, a0, a2
@@ -1817,19 +1817,19 @@ for.body:                                         ; preds = %for.body.preheader,
 define void @sink_splat_fsub_scalable(ptr nocapture %a, float %x) {
 ; CHECK-LABEL: sink_splat_fsub_scalable:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a3, a1, 2
-; CHECK-NEXT:    li a2, 1024
-; CHECK-NEXT:    bgeu a2, a3, .LBB30_2
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    srli a3, a2, 2
+; CHECK-NEXT:    li a1, 1024
+; CHECK-NEXT:    bgeu a1, a3, .LBB30_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a2, 0
+; CHECK-NEXT:    li a1, 0
 ; CHECK-NEXT:    j .LBB30_5
 ; CHECK-NEXT:  .LBB30_2: # %vector.ph
-; CHECK-NEXT:    addi a2, a3, -1
-; CHECK-NEXT:    andi a4, a2, 1024
-; CHECK-NEXT:    xori a2, a4, 1024
+; CHECK-NEXT:    addi a1, a3, -1
+; CHECK-NEXT:    andi a4, a1, 1024
+; CHECK-NEXT:    xori a1, a4, 1024
 ; CHECK-NEXT:    mv a5, a0
-; CHECK-NEXT:    mv a6, a2
+; CHECK-NEXT:    mv a6, a1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB30_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1837,12 +1837,12 @@ define void @sink_splat_fsub_scalable(ptr nocapture %a, float %x) {
 ; CHECK-NEXT:    sub a6, a6, a3
 ; CHECK-NEXT:    vfsub.vf v8, v8, fa0
 ; CHECK-NEXT:    vs1r.v v8, (a5)
-; CHECK-NEXT:    add a5, a5, a1
+; CHECK-NEXT:    add a5, a5, a2
 ; CHECK-NEXT:    bnez a6, .LBB30_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB30_7
 ; CHECK-NEXT:  .LBB30_5: # %for.body.preheader
-; CHECK-NEXT:    slli a1, a2, 2
+; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    lui a2, 1
 ; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    add a0, a0, a2
@@ -1907,19 +1907,19 @@ for.body:                                         ; preds = %for.body.preheader,
 define void @sink_splat_frsub_scalable(ptr nocapture %a, float %x) {
 ; CHECK-LABEL: sink_splat_frsub_scalable:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a3, a1, 2
-; CHECK-NEXT:    li a2, 1024
-; CHECK-NEXT:    bgeu a2, a3, .LBB31_2
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    srli a3, a2, 2
+; CHECK-NEXT:    li a1, 1024
+; CHECK-NEXT:    bgeu a1, a3, .LBB31_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a2, 0
+; CHECK-NEXT:    li a1, 0
 ; CHECK-NEXT:    j .LBB31_5
 ; CHECK-NEXT:  .LBB31_2: # %vector.ph
-; CHECK-NEXT:    addi a2, a3, -1
-; CHECK-NEXT:    andi a4, a2, 1024
-; CHECK-NEXT:    xori a2, a4, 1024
+; CHECK-NEXT:    addi a1, a3, -1
+; CHECK-NEXT:    andi a4, a1, 1024
+; CHECK-NEXT:    xori a1, a4, 1024
 ; CHECK-NEXT:    mv a5, a0
-; CHECK-NEXT:    mv a6, a2
+; CHECK-NEXT:    mv a6, a1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB31_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1927,12 +1927,12 @@ define void @sink_splat_frsub_scalable(ptr nocapture %a, float %x) {
 ; CHECK-NEXT:    sub a6, a6, a3
 ; CHECK-NEXT:    vfrsub.vf v8, v8, fa0
 ; CHECK-NEXT:    vs1r.v v8, (a5)
-; CHECK-NEXT:    add a5, a5, a1
+; CHECK-NEXT:    add a5, a5, a2
 ; CHECK-NEXT:    bnez a6, .LBB31_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB31_7
 ; CHECK-NEXT:  .LBB31_5: # %for.body.preheader
-; CHECK-NEXT:    slli a1, a2, 2
+; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    lui a2, 1
 ; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    add a0, a0, a2
@@ -2073,35 +2073,35 @@ for.cond.cleanup:                                 ; preds = %vector.body
 define void @sink_splat_fma_scalable(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, float %x) {
 ; CHECK-LABEL: sink_splat_fma_scalable:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    srli a4, a2, 2
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a4, .LBB34_2
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    srli a4, a3, 2
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a4, .LBB34_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB34_5
 ; CHECK-NEXT:  .LBB34_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a4, -1
-; CHECK-NEXT:    andi a5, a3, 1024
-; CHECK-NEXT:    xori a3, a5, 1024
+; CHECK-NEXT:    addi a2, a4, -1
+; CHECK-NEXT:    andi a5, a2, 1024
+; CHECK-NEXT:    xori a2, a5, 1024
 ; CHECK-NEXT:    mv a6, a0
 ; CHECK-NEXT:    mv a7, a1
-; CHECK-NEXT:    mv t0, a3
+; CHECK-NEXT:    mv t0, a2
 ; CHECK-NEXT:    vsetvli t1, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB34_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl1re32.v v8, (a6)
 ; CHECK-NEXT:    vl1re32.v v9, (a7)
 ; CHECK-NEXT:    sub t0, t0, a4
-; CHECK-NEXT:    add a7, a7, a2
+; CHECK-NEXT:    add a7, a7, a3
 ; CHECK-NEXT:    vfmacc.vf v9, fa0, v8
 ; CHECK-NEXT:    vs1r.v v9, (a6)
-; CHECK-NEXT:    add a6, a6, a2
+; CHECK-NEXT:    add a6, a6, a3
 ; CHECK-NEXT:    bnez t0, .LBB34_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a5, .LBB34_7
 ; CHECK-NEXT:  .LBB34_5: # %for.body.preheader
-; CHECK-NEXT:    slli a2, a3, 2
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    lui a3, 1
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a2, a1, a2
@@ -2173,35 +2173,35 @@ for.body:                                         ; preds = %for.body.preheader,
 define void @sink_splat_fma_commute_scalable(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, float %x) {
 ; CHECK-LABEL: sink_splat_fma_commute_scalable:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    srli a4, a2, 2
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a4, .LBB35_2
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    srli a4, a3, 2
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a4, .LBB35_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB35_5
 ; CHECK-NEXT:  .LBB35_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a4, -1
-; CHECK-NEXT:    andi a5, a3, 1024
-; CHECK-NEXT:    xori a3, a5, 1024
+; CHECK-NEXT:    addi a2, a4, -1
+; CHECK-NEXT:    andi a5, a2, 1024
+; CHECK-NEXT:    xori a2, a5, 1024
 ; CHECK-NEXT:    mv a6, a0
 ; CHECK-NEXT:    mv a7, a1
-; CHECK-NEXT:    mv t0, a3
+; CHECK-NEXT:    mv t0, a2
 ; CHECK-NEXT:    vsetvli t1, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB35_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl1re32.v v8, (a6)
 ; CHECK-NEXT:    vl1re32.v v9, (a7)
 ; CHECK-NEXT:    sub t0, t0, a4
-; CHECK-NEXT:    add a7, a7, a2
+; CHECK-NEXT:    add a7, a7, a3
 ; CHECK-NEXT:    vfmacc.vf v9, fa0, v8
 ; CHECK-NEXT:    vs1r.v v9, (a6)
-; CHECK-NEXT:    add a6, a6, a2
+; CHECK-NEXT:    add a6, a6, a3
 ; CHECK-NEXT:    bnez t0, .LBB35_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a5, .LBB35_7
 ; CHECK-NEXT:  .LBB35_5: # %for.body.preheader
-; CHECK-NEXT:    slli a2, a3, 2
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    lui a3, 1
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a2, a1, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
index 62339130678d0..86cf1ee04b60a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
@@ -561,14 +561,14 @@ define <vscale x 16 x i64> @add_stepvector_nxv16i64() {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vid.v v8
 ; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vid.v v8
-; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vadd.vv v16, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -596,16 +596,16 @@ define <vscale x 16 x i64> @mul_stepvector_nxv16i64() {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vid.v v8
+; RV32-NEXT:    li a1, 3
+; RV32-NEXT:    vmul.vx v8, v8, a1
 ; RV32-NEXT:    slli a1, a0, 1
 ; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vid.v v8
-; RV32-NEXT:    li a0, 3
-; RV32-NEXT:    vmul.vx v8, v8, a0
 ; RV32-NEXT:    vadd.vv v16, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -649,16 +649,16 @@ define <vscale x 16 x i64> @mul_bigimm_stepvector_nxv16i64() {
 ; RV32-NEXT:    slli a3, a0, 1
 ; RV32-NEXT:    slli a0, a0, 6
 ; RV32-NEXT:    sub a0, a0, a3
+; RV32-NEXT:    addi a3, sp, 8
+; RV32-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
+; RV32-NEXT:    vid.v v8
 ; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    sw a2, 0(sp)
 ; RV32-NEXT:    sw a0, 4(sp)
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    mv a0, sp
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vid.v v24
-; RV32-NEXT:    vmul.vv v8, v24, v8
+; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vadd.vv v16, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -689,14 +689,14 @@ define <vscale x 16 x i64> @shl_stepvector_nxv16i64() {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vid.v v8
 ; RV32-NEXT:    slli a0, a0, 2
+; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vid.v v8
-; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vadd.vv v16, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
index ecd098edb30ae..881a8795cc504 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
@@ -676,9 +676,9 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV32-NEXT:  .LBB55_2:
 ; CHECK-RV32-NEXT:    mul a5, a3, a1
 ; CHECK-RV32-NEXT:    srli a4, a4, 3
+; CHECK-RV32-NEXT:    add a5, a0, a5
 ; CHECK-RV32-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vx v8, v9, a4
-; CHECK-RV32-NEXT:    add a5, a0, a5
 ; CHECK-RV32-NEXT:    vmv1r.v v0, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v16, (a5), a1, v0.t
@@ -702,9 +702,9 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV64-NEXT:  .LBB55_2:
 ; CHECK-RV64-NEXT:    mul a5, a2, a1
 ; CHECK-RV64-NEXT:    srli a4, a4, 3
+; CHECK-RV64-NEXT:    add a5, a0, a5
 ; CHECK-RV64-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vx v8, v9, a4
-; CHECK-RV64-NEXT:    add a5, a0, a5
 ; CHECK-RV64-NEXT:    vmv1r.v v0, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vlse64.v v16, (a5), a1, v0.t
@@ -788,9 +788,9 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV32-NEXT:    mul t1, a5, a1
 ; CHECK-RV32-NEXT:    srli t2, a2, 3
 ; CHECK-RV32-NEXT:    sub a7, a3, a7
+; CHECK-RV32-NEXT:    add t1, a0, t1
 ; CHECK-RV32-NEXT:    vsetvli t3, zero, e8, mf4, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vx v0, v8, t2
-; CHECK-RV32-NEXT:    add t1, a0, t1
 ; CHECK-RV32-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v16, (t1), a1, v0.t
 ; CHECK-RV32-NEXT:    sltu a3, a3, a7
@@ -802,9 +802,9 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV32-NEXT:  .LBB57_6:
 ; CHECK-RV32-NEXT:    mul a6, a6, a1
 ; CHECK-RV32-NEXT:    srli a2, a2, 2
+; CHECK-RV32-NEXT:    add a6, a0, a6
 ; CHECK-RV32-NEXT:    vsetvli a7, zero, e8, mf2, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vx v0, v8, a2
-; CHECK-RV32-NEXT:    add a6, a0, a6
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v24, (a6), a1, v0.t
 ; CHECK-RV32-NEXT:    vmv1r.v v0, v8
@@ -836,9 +836,9 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV64-NEXT:    mul t1, a5, a1
 ; CHECK-RV64-NEXT:    srli t2, a4, 3
 ; CHECK-RV64-NEXT:    sub a7, a2, a7
+; CHECK-RV64-NEXT:    add t1, a0, t1
 ; CHECK-RV64-NEXT:    vsetvli t3, zero, e8, mf4, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vx v0, v8, t2
-; CHECK-RV64-NEXT:    add t1, a0, t1
 ; CHECK-RV64-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vlse64.v v16, (t1), a1, v0.t
 ; CHECK-RV64-NEXT:    sltu a2, a2, a7
@@ -850,9 +850,9 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV64-NEXT:  .LBB57_6:
 ; CHECK-RV64-NEXT:    mul a6, a6, a1
 ; CHECK-RV64-NEXT:    srli a4, a4, 2
+; CHECK-RV64-NEXT:    add a6, a0, a6
 ; CHECK-RV64-NEXT:    vsetvli a7, zero, e8, mf2, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vx v0, v8, a4
-; CHECK-RV64-NEXT:    add a6, a0, a6
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vlse64.v v24, (a6), a1, v0.t
 ; CHECK-RV64-NEXT:    vmv1r.v v0, v8
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
index 9dcee7e5cb7d1..b665be5825f15 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
@@ -567,11 +567,11 @@ define void @strided_store_nxv16f64(<vscale x 16 x double> %v, ptr %ptr, i32 sig
 ; CHECK-NEXT:    mul a4, a4, a1
 ; CHECK-NEXT:    srli a3, a3, 3
 ; CHECK-NEXT:    sltu a2, a2, a5
-; CHECK-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a3
+; CHECK-NEXT:    add a0, a0, a4
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a5
-; CHECK-NEXT:    add a0, a0, a4
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v16, (a0), a1, v0.t
 ; CHECK-NEXT:    ret
@@ -646,21 +646,21 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
 ; CHECK-NEXT:    sltu a3, a3, a6
 ; CHECK-NEXT:    addi t0, t0, -1
 ; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and t0, t0, a0
-; CHECK-NEXT:    and a0, a3, a6
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
+; CHECK-NEXT:    and a0, t0, a0
+; CHECK-NEXT:    addi t0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (t0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v8, (a7), a2, v0.t
+; CHECK-NEXT:    and a0, a3, a6
 ; CHECK-NEXT:    bltu a0, a4, .LBB48_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a0, a4
 ; CHECK-NEXT:  .LBB48_6:
 ; CHECK-NEXT:    mul a3, a5, a2
 ; CHECK-NEXT:    srli a4, a4, 2
-; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a4
 ; CHECK-NEXT:    add a1, a1, a3
+; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v24, a4
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v16, (a1), a2, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/umulo-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/umulo-sdnode.ll
index 68e0c0089d0c7..a5dd27149c1f2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/umulo-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/umulo-sdnode.ll
@@ -7,10 +7,10 @@ define <vscale x 1 x i8> @umulo_nxv1i8(<vscale x 1 x i8> %x, <vscale x 1 x i8> %
 ; CHECK-LABEL: umulo_nxv1i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 1 x i8>, <vscale x 1 x i1> } @llvm.umul.with.overflow.nxv1i8(<vscale x 1 x i8> %x, <vscale x 1 x i8> %y)
   %b = extractvalue { <vscale x 1 x i8>, <vscale x 1 x i1> } %a, 0
@@ -25,10 +25,10 @@ define <vscale x 2 x i8> @umulo_nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %
 ; CHECK-LABEL: umulo_nxv2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 2 x i8>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y)
   %b = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i1> } %a, 0
@@ -43,10 +43,10 @@ define <vscale x 4 x i8> @umulo_nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8> %
 ; CHECK-LABEL: umulo_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 4 x i8>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8> %y)
   %b = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i1> } %a, 0
@@ -61,10 +61,10 @@ define <vscale x 8 x i8> @umulo_nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %
 ; CHECK-LABEL: umulo_nxv8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 8 x i8>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y)
   %b = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i1> } %a, 0
@@ -79,10 +79,10 @@ define <vscale x 16 x i8> @umulo_nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i
 ; CHECK-LABEL: umulo_nxv16i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmulhu.vv v12, v8, v10
-; CHECK-NEXT:    vmsne.vi v0, v12, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v10
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v12, v8, v10
+; CHECK-NEXT:    vmulhu.vv v8, v8, v10
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v12, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 16 x i8>, <vscale x 16 x i1> } @llvm.umul.with.overflow.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y)
   %b = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i1> } %a, 0
@@ -97,10 +97,10 @@ define <vscale x 32 x i8> @umulo_nxv32i8(<vscale x 32 x i8> %x, <vscale x 32 x i
 ; CHECK-LABEL: umulo_nxv32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmulhu.vv v16, v8, v12
-; CHECK-NEXT:    vmsne.vi v0, v16, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v12
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v16, v8, v12
+; CHECK-NEXT:    vmulhu.vv v8, v8, v12
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v16, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 32 x i8>, <vscale x 32 x i1> } @llvm.umul.with.overflow.nxv32i8(<vscale x 32 x i8> %x, <vscale x 32 x i8> %y)
   %b = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i1> } %a, 0
@@ -115,10 +115,10 @@ define <vscale x 64 x i8> @umulo_nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i
 ; CHECK-LABEL: umulo_nxv64i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmulhu.vv v24, v8, v16
-; CHECK-NEXT:    vmsne.vi v0, v24, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v24, v8, v16
+; CHECK-NEXT:    vmulhu.vv v8, v8, v16
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v24, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 64 x i8>, <vscale x 64 x i1> } @llvm.umul.with.overflow.nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %y)
   %b = extractvalue { <vscale x 64 x i8>, <vscale x 64 x i1> } %a, 0
@@ -133,10 +133,10 @@ define <vscale x 1 x i16> @umulo_nxv1i16(<vscale x 1 x i16> %x, <vscale x 1 x i1
 ; CHECK-LABEL: umulo_nxv1i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 1 x i16>, <vscale x 1 x i1> } @llvm.umul.with.overflow.nxv1i16(<vscale x 1 x i16> %x, <vscale x 1 x i16> %y)
   %b = extractvalue { <vscale x 1 x i16>, <vscale x 1 x i1> } %a, 0
@@ -151,10 +151,10 @@ define <vscale x 2 x i16> @umulo_nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i1
 ; CHECK-LABEL: umulo_nxv2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 2 x i16>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %y)
   %b = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i1> } %a, 0
@@ -169,10 +169,10 @@ define <vscale x 4 x i16> @umulo_nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i1
 ; CHECK-LABEL: umulo_nxv4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 4 x i16>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y)
   %b = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i1> } %a, 0
@@ -187,10 +187,10 @@ define <vscale x 8 x i16> @umulo_nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i1
 ; CHECK-LABEL: umulo_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmulhu.vv v12, v8, v10
-; CHECK-NEXT:    vmsne.vi v0, v12, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v10
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v12, v8, v10
+; CHECK-NEXT:    vmulhu.vv v8, v8, v10
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v12, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 8 x i16>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y)
   %b = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i1> } %a, 0
@@ -205,10 +205,10 @@ define <vscale x 16 x i16> @umulo_nxv16i16(<vscale x 16 x i16> %x, <vscale x 16
 ; CHECK-LABEL: umulo_nxv16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmulhu.vv v16, v8, v12
-; CHECK-NEXT:    vmsne.vi v0, v16, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v12
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v16, v8, v12
+; CHECK-NEXT:    vmulhu.vv v8, v8, v12
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v16, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 16 x i16>, <vscale x 16 x i1> } @llvm.umul.with.overflow.nxv16i16(<vscale x 16 x i16> %x, <vscale x 16 x i16> %y)
   %b = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i1> } %a, 0
@@ -223,10 +223,10 @@ define <vscale x 32 x i16> @umulo_nxv32i16(<vscale x 32 x i16> %x, <vscale x 32
 ; CHECK-LABEL: umulo_nxv32i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmulhu.vv v24, v8, v16
-; CHECK-NEXT:    vmsne.vi v0, v24, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v24, v8, v16
+; CHECK-NEXT:    vmulhu.vv v8, v8, v16
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v24, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 32 x i16>, <vscale x 32 x i1> } @llvm.umul.with.overflow.nxv32i16(<vscale x 32 x i16> %x, <vscale x 32 x i16> %y)
   %b = extractvalue { <vscale x 32 x i16>, <vscale x 32 x i1> } %a, 0
@@ -241,10 +241,10 @@ define <vscale x 1 x i32> @umulo_nxv1i32(<vscale x 1 x i32> %x, <vscale x 1 x i3
 ; CHECK-LABEL: umulo_nxv1i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 1 x i32>, <vscale x 1 x i1> } @llvm.umul.with.overflow.nxv1i32(<vscale x 1 x i32> %x, <vscale x 1 x i32> %y)
   %b = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i1> } %a, 0
@@ -259,10 +259,10 @@ define <vscale x 2 x i32> @umulo_nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i3
 ; CHECK-LABEL: umulo_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 2 x i32>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y)
   %b = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i1> } %a, 0
@@ -277,10 +277,10 @@ define <vscale x 4 x i32> @umulo_nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i3
 ; CHECK-LABEL: umulo_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmulhu.vv v12, v8, v10
-; CHECK-NEXT:    vmsne.vi v0, v12, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v10
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v12, v8, v10
+; CHECK-NEXT:    vmulhu.vv v8, v8, v10
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v12, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 4 x i32>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y)
   %b = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } %a, 0
@@ -295,10 +295,10 @@ define <vscale x 8 x i32> @umulo_nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x i3
 ; CHECK-LABEL: umulo_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmulhu.vv v16, v8, v12
-; CHECK-NEXT:    vmsne.vi v0, v16, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v12
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v16, v8, v12
+; CHECK-NEXT:    vmulhu.vv v8, v8, v12
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v16, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 8 x i32>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x i32> %y)
   %b = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i1> } %a, 0
@@ -313,10 +313,10 @@ define <vscale x 16 x i32> @umulo_nxv16i32(<vscale x 16 x i32> %x, <vscale x 16
 ; CHECK-LABEL: umulo_nxv16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmulhu.vv v24, v8, v16
-; CHECK-NEXT:    vmsne.vi v0, v24, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v24, v8, v16
+; CHECK-NEXT:    vmulhu.vv v8, v8, v16
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v24, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 16 x i32>, <vscale x 16 x i1> } @llvm.umul.with.overflow.nxv16i32(<vscale x 16 x i32> %x, <vscale x 16 x i32> %y)
   %b = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i1> } %a, 0
@@ -331,10 +331,10 @@ define <vscale x 1 x i64> @umulo_nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i6
 ; CHECK-LABEL: umulo_nxv1i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vmulhu.vv v10, v8, v9
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v10, v8, v9
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 1 x i64>, <vscale x 1 x i1> } @llvm.umul.with.overflow.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %y)
   %b = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i1> } %a, 0
@@ -349,10 +349,10 @@ define <vscale x 2 x i64> @umulo_nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i6
 ; CHECK-LABEL: umulo_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmulhu.vv v12, v8, v10
-; CHECK-NEXT:    vmsne.vi v0, v12, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v10
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v12, v8, v10
+; CHECK-NEXT:    vmulhu.vv v8, v8, v10
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v12, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 2 x i64>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y)
   %b = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i1> } %a, 0
@@ -367,10 +367,10 @@ define <vscale x 4 x i64> @umulo_nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x i6
 ; CHECK-LABEL: umulo_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vmulhu.vv v16, v8, v12
-; CHECK-NEXT:    vmsne.vi v0, v16, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v12
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v16, v8, v12
+; CHECK-NEXT:    vmulhu.vv v8, v8, v12
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v16, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 4 x i64>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x i64> %y)
   %b = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i1> } %a, 0
@@ -385,10 +385,10 @@ define <vscale x 8 x i64> @umulo_nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i6
 ; CHECK-LABEL: umulo_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmulhu.vv v24, v8, v16
-; CHECK-NEXT:    vmsne.vi v0, v24, 0
-; CHECK-NEXT:    vmul.vv v8, v8, v16
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT:    vmul.vv v24, v8, v16
+; CHECK-NEXT:    vmulhu.vv v8, v8, v16
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v24, 0, v0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 8 x i64>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y)
   %b = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i1> } %a, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll b/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll
index 0bd82e654e021..2c89e939940b7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll
@@ -10,13 +10,13 @@ define <vscale x 1 x i16> @test_urem_vec_even_divisor_eq0(<vscale x 1 x i16> %x)
 ; RV32-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; RV32-NEXT:    vmul.vx v8, v8, a0
 ; RV32-NEXT:    lui a0, 3
-; RV32-NEXT:    addi a0, a0, -1366
 ; RV32-NEXT:    vsll.vi v9, v8, 15
 ; RV32-NEXT:    vsrl.vi v8, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vmv.v.i v9, 0
+; RV32-NEXT:    addi a0, a0, -1366
 ; RV32-NEXT:    vmsgtu.vx v0, v8, a0
-; RV32-NEXT:    vmv.v.i v8, 0
-; RV32-NEXT:    vmerge.vim v8, v8, -1, v0
+; RV32-NEXT:    vmerge.vim v8, v9, -1, v0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_urem_vec_even_divisor_eq0:
@@ -26,13 +26,13 @@ define <vscale x 1 x i16> @test_urem_vec_even_divisor_eq0(<vscale x 1 x i16> %x)
 ; RV64-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; RV64-NEXT:    vmul.vx v8, v8, a0
 ; RV64-NEXT:    lui a0, 3
-; RV64-NEXT:    addi a0, a0, -1366
 ; RV64-NEXT:    vsll.vi v9, v8, 15
 ; RV64-NEXT:    vsrl.vi v8, v8, 1
 ; RV64-NEXT:    vor.vv v8, v8, v9
+; RV64-NEXT:    vmv.v.i v9, 0
+; RV64-NEXT:    addi a0, a0, -1366
 ; RV64-NEXT:    vmsgtu.vx v0, v8, a0
-; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    vmerge.vim v8, v8, -1, v0
+; RV64-NEXT:    vmerge.vim v8, v9, -1, v0
 ; RV64-NEXT:    ret
   %urem = urem <vscale x 1 x i16> %x, splat (i16 6)
   %cmp = icmp ne <vscale x 1 x i16> %urem, splat (i16 0)
@@ -48,10 +48,10 @@ define <vscale x 1 x i16> @test_urem_vec_odd_divisor_eq0(<vscale x 1 x i16> %x)
 ; RV32-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; RV32-NEXT:    vmul.vx v8, v8, a0
 ; RV32-NEXT:    lui a0, 3
+; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    addi a0, a0, 819
 ; RV32-NEXT:    vmsgtu.vx v0, v8, a0
-; RV32-NEXT:    vmv.v.i v8, 0
-; RV32-NEXT:    vmerge.vim v8, v8, -1, v0
+; RV32-NEXT:    vmerge.vim v8, v9, -1, v0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_urem_vec_odd_divisor_eq0:
@@ -61,10 +61,10 @@ define <vscale x 1 x i16> @test_urem_vec_odd_divisor_eq0(<vscale x 1 x i16> %x)
 ; RV64-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; RV64-NEXT:    vmul.vx v8, v8, a0
 ; RV64-NEXT:    lui a0, 3
+; RV64-NEXT:    vmv.v.i v9, 0
 ; RV64-NEXT:    addi a0, a0, 819
 ; RV64-NEXT:    vmsgtu.vx v0, v8, a0
-; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    vmerge.vim v8, v8, -1, v0
+; RV64-NEXT:    vmerge.vim v8, v9, -1, v0
 ; RV64-NEXT:    ret
   %urem = urem <vscale x 1 x i16> %x, splat (i16 5)
   %cmp = icmp ne <vscale x 1 x i16> %urem, splat (i16 0)
@@ -82,13 +82,13 @@ define <vscale x 1 x i16> @test_urem_vec_even_divisor_eq1(<vscale x 1 x i16> %x)
 ; RV32-NEXT:    addi a0, a0, -1365
 ; RV32-NEXT:    vmul.vx v8, v8, a0
 ; RV32-NEXT:    lui a0, 3
-; RV32-NEXT:    addi a0, a0, -1366
 ; RV32-NEXT:    vsll.vi v9, v8, 15
 ; RV32-NEXT:    vsrl.vi v8, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vmv.v.i v9, 0
+; RV32-NEXT:    addi a0, a0, -1366
 ; RV32-NEXT:    vmsgtu.vx v0, v8, a0
-; RV32-NEXT:    vmv.v.i v8, 0
-; RV32-NEXT:    vmerge.vim v8, v8, -1, v0
+; RV32-NEXT:    vmerge.vim v8, v9, -1, v0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_urem_vec_even_divisor_eq1:
@@ -100,13 +100,13 @@ define <vscale x 1 x i16> @test_urem_vec_even_divisor_eq1(<vscale x 1 x i16> %x)
 ; RV64-NEXT:    addi a0, a0, -1365
 ; RV64-NEXT:    vmul.vx v8, v8, a0
 ; RV64-NEXT:    lui a0, 3
-; RV64-NEXT:    addi a0, a0, -1366
 ; RV64-NEXT:    vsll.vi v9, v8, 15
 ; RV64-NEXT:    vsrl.vi v8, v8, 1
 ; RV64-NEXT:    vor.vv v8, v8, v9
+; RV64-NEXT:    vmv.v.i v9, 0
+; RV64-NEXT:    addi a0, a0, -1366
 ; RV64-NEXT:    vmsgtu.vx v0, v8, a0
-; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    vmerge.vim v8, v8, -1, v0
+; RV64-NEXT:    vmerge.vim v8, v9, -1, v0
 ; RV64-NEXT:    ret
   %urem = urem <vscale x 1 x i16> %x, splat (i16 6)
   %cmp = icmp ne <vscale x 1 x i16> %urem, splat (i16 1)
@@ -124,10 +124,10 @@ define <vscale x 1 x i16> @test_urem_vec_odd_divisor_eq1(<vscale x 1 x i16> %x)
 ; RV32-NEXT:    addi a0, a0, -819
 ; RV32-NEXT:    vmul.vx v8, v8, a0
 ; RV32-NEXT:    lui a0, 3
+; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    addi a0, a0, 818
 ; RV32-NEXT:    vmsgtu.vx v0, v8, a0
-; RV32-NEXT:    vmv.v.i v8, 0
-; RV32-NEXT:    vmerge.vim v8, v8, -1, v0
+; RV32-NEXT:    vmerge.vim v8, v9, -1, v0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_urem_vec_odd_divisor_eq1:
@@ -139,10 +139,10 @@ define <vscale x 1 x i16> @test_urem_vec_odd_divisor_eq1(<vscale x 1 x i16> %x)
 ; RV64-NEXT:    addi a0, a0, -819
 ; RV64-NEXT:    vmul.vx v8, v8, a0
 ; RV64-NEXT:    lui a0, 3
+; RV64-NEXT:    vmv.v.i v9, 0
 ; RV64-NEXT:    addi a0, a0, 818
 ; RV64-NEXT:    vmsgtu.vx v0, v8, a0
-; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    vmerge.vim v8, v8, -1, v0
+; RV64-NEXT:    vmerge.vim v8, v9, -1, v0
 ; RV64-NEXT:    ret
   %urem = urem <vscale x 1 x i16> %x, splat (i16 5)
   %cmp = icmp ne <vscale x 1 x i16> %urem, splat (i16 1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
index 77f3cf3ca4980..cd1609f90c6b7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
@@ -1442,12 +1442,11 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32_evl_nx16(<vscale x 32 x i32> %va, <
 ; RV64-LABEL: vadd_vi_nxv32i32_evl_nx16:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV64-NEXT:    vadd.vi v8, v8, -1, v0.t
 ; RV64-NEXT:    srli a0, a0, 2
 ; RV64-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vx v24, v0, a0
-; RV64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; RV64-NEXT:    vadd.vi v8, v8, -1, v0.t
-; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vslidedown.vx v0, v0, a0
 ; RV64-NEXT:    vsetivli zero, 0, e32, m8, ta, ma
 ; RV64-NEXT:    vadd.vi v16, v16, -1, v0.t
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index 9f0b2b3914836..6e9826b2fcdb3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -204,19 +204,19 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterle
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v14, v8, 8
 ; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    add a2, a1, a1
 ; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v11, v10, a1
 ; CHECK-NEXT:    vslideup.vx v8, v12, a1
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a2, a0, a0
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v11, a0
-; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vmv1r.v v9, v14
-; CHECK-NEXT:    vs2r.v v8, (a0)
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vlseg5e16.v v8, (a0)
+; CHECK-NEXT:    vs2r.v v8, (a1)
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vlseg5e16.v v8, (a1)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
@@ -576,19 +576,19 @@ define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_dein
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v14, v8, 8
 ; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    add a2, a1, a1
 ; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v11, v10, a1
 ; CHECK-NEXT:    vslideup.vx v8, v12, a1
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a2, a0, a0
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v11, a0
-; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vmv1r.v v9, v14
-; CHECK-NEXT:    vs2r.v v8, (a0)
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vlseg5e16.v v8, (a0)
+; CHECK-NEXT:    vs2r.v v8, (a1)
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vlseg5e16.v v8, (a1)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index 14f306da21dba..55359e82e9720 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -12,11 +12,12 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_load_nxv16i
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vlm.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmerge.vim v14, v10, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
@@ -134,44 +135,62 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i6
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a2, a1, 4
+; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
 ; CHECK-NEXT:    li a1, 85
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v16, a1
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    li a1, 170
 ; CHECK-NEXT:    vl8re64.v v0, (a0)
-; CHECK-NEXT:    vmv.v.x v17, a1
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vcompress.vm v8, v24, v16
 ; CHECK-NEXT:    vmv1r.v v12, v16
-; CHECK-NEXT:    vmv1r.v v13, v17
-; CHECK-NEXT:    vcompress.vm v16, v24, v13
-; CHECK-NEXT:    vcompress.vm v24, v0, v12
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vcompress.vm v24, v0, v13
+; CHECK-NEXT:    vcompress.vm v16, v0, v12
+; CHECK-NEXT:    vmv4r.v v12, v16
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.x v16, a1
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv4r.v v12, v24
+; CHECK-NEXT:    vs1r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl1r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv4r.v v20, v24
+; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vcompress.vm v16, v0, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl1r.v v20, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vcompress.vm v24, v0, v20
+; CHECK-NEXT:    vmv4r.v v20, v24
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 4
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index 81b6de9e662d5..ea1a6fe03501b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -10,15 +10,14 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv
 ; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v12, v8, 1, v0
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v0, a0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v14, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v14, v8, 1, v0
 ; CHECK-NEXT:    vnsrl.wi v8, v12, 0
 ; CHECK-NEXT:    vnsrl.wi v10, v12, 8
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
@@ -179,41 +178,63 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nxv
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a1, a0, 4
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v7, a0
 ; CHECK-NEXT:    li a0, 170
-; CHECK-NEXT:    vmv.v.x v6, a0
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vcompress.vm v24, v8, v7
 ; CHECK-NEXT:    vmv1r.v v28, v7
-; CHECK-NEXT:    vmv1r.v v29, v6
-; CHECK-NEXT:    vcompress.vm v0, v8, v29
-; CHECK-NEXT:    vcompress.vm v8, v16, v28
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vcompress.vm v8, v16, v29
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vcompress.vm v0, v16, v28
+; CHECK-NEXT:    vmv4r.v v28, v0
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.x v7, a0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv4r.v v28, v8
+; CHECK-NEXT:    vs1r.v v7, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl1r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vcompress.vm v0, v8, v24
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl1r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vcompress.vm v8, v16, v24
 ; CHECK-NEXT:    vmv4r.v v4, v8
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a1, a0, 4
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -401,41 +422,63 @@ define {<vscale x 8 x double>, <vscale x 8 x double>} @vector_deinterleave_nxv8f
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a1, a0, 4
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v7, a0
 ; CHECK-NEXT:    li a0, 170
-; CHECK-NEXT:    vmv.v.x v6, a0
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vcompress.vm v24, v8, v7
 ; CHECK-NEXT:    vmv1r.v v28, v7
-; CHECK-NEXT:    vmv1r.v v29, v6
-; CHECK-NEXT:    vcompress.vm v0, v8, v29
-; CHECK-NEXT:    vcompress.vm v8, v16, v28
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vcompress.vm v8, v16, v29
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vcompress.vm v0, v16, v28
+; CHECK-NEXT:    vmv4r.v v28, v0
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.x v7, a0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv4r.v v28, v8
+; CHECK-NEXT:    vs1r.v v7, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl1r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vcompress.vm v0, v8, v24
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl1r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vcompress.vm v8, v16, v24
 ; CHECK-NEXT:    vmv4r.v v4, v8
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a1, a0, 4
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -578,41 +621,41 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmv.v.i v12, 0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
-; CHECK-NEXT:    srli a1, a0, 2
-; CHECK-NEXT:    srli a2, a0, 1
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    srli a2, a0, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a1
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    vslidedown.vx v10, v9, a2
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v18, v12, 1, v0
-; CHECK-NEXT:    sub a0, a0, a1
-; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v9, v9, a0
+; CHECK-NEXT:    vslidedown.vx v10, v0, a2
+; CHECK-NEXT:    srli a2, a0, 1
+; CHECK-NEXT:    vslidedown.vx v11, v0, a2
+; CHECK-NEXT:    srli a2, a0, 3
+; CHECK-NEXT:    slli a2, a2, 1
+; CHECK-NEXT:    sub a0, a0, a2
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v18, v12, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    vmerge.vim v20, v12, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v14, v12, 1, v0
-; CHECK-NEXT:    vmv1r.v v10, v15
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v9, a0
+; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v10, v12, 1, v0
+; CHECK-NEXT:    vmv1r.v v18, v11
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
-; CHECK-NEXT:    vmv1r.v v11, v12
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v8, v21
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vmv1r.v v9, v14
-; CHECK-NEXT:    vs8r.v v16, (a0)
-; CHECK-NEXT:    vmv1r.v v12, v13
-; CHECK-NEXT:    vs8r.v v8, (a1)
-; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vlseg5e8.v v8, (a0)
-; CHECK-NEXT:    vlseg5e8.v v14, (a1)
+; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
+; CHECK-NEXT:    vmv1r.v v19, v8
+; CHECK-NEXT:    vmv1r.v v16, v21
+; CHECK-NEXT:    vmv1r.v v17, v10
+; CHECK-NEXT:    vmv1r.v v20, v9
+; CHECK-NEXT:    vs8r.v v16, (a2)
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vlseg5e8.v v8, (a1)
+; CHECK-NEXT:    vlseg5e8.v v14, (a2)
 ; CHECK-NEXT:    vmv2r.v v20, v8
 ; CHECK-NEXT:    vmv2r.v v22, v10
 ; CHECK-NEXT:    vmv1r.v v21, v14
@@ -643,17 +686,17 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v26, v15
-; CHECK-NEXT:    vmv1r.v v27, v16
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v26, v15
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vmv1r.v v25, v14
 ; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv1r.v v27, v16
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    vmv1r.v v25, v14
 ; CHECK-NEXT:    vmv1r.v v28, v17
 ; CHECK-NEXT:    vs8r.v v24, (a1)
 ; CHECK-NEXT:    vlseg5e8.v v12, (a0)
@@ -684,17 +727,17 @@ define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v26, v15
-; CHECK-NEXT:    vmv1r.v v27, v16
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v26, v15
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vmv1r.v v25, v14
 ; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv1r.v v27, v16
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    vmv1r.v v25, v14
 ; CHECK-NEXT:    vmv1r.v v28, v17
 ; CHECK-NEXT:    vs8r.v v24, (a1)
 ; CHECK-NEXT:    vlseg5e16.v v12, (a0)
@@ -725,17 +768,17 @@ define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v26, v15
-; CHECK-NEXT:    vmv1r.v v27, v16
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v26, v15
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vmv1r.v v25, v14
 ; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv1r.v v27, v16
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    vmv1r.v v25, v14
 ; CHECK-NEXT:    vmv1r.v v28, v17
 ; CHECK-NEXT:    vs8r.v v24, (a1)
 ; CHECK-NEXT:    vlseg5e32.v v12, (a0)
@@ -766,17 +809,17 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v26, v15
-; CHECK-NEXT:    vmv1r.v v27, v16
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v26, v15
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vmv1r.v v25, v14
 ; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv1r.v v27, v16
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    vmv1r.v v25, v14
 ; CHECK-NEXT:    vmv1r.v v28, v17
 ; CHECK-NEXT:    vs8r.v v24, (a1)
 ; CHECK-NEXT:    vlseg5e64.v v12, (a0)
@@ -808,52 +851,50 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
-; CHECK-NEXT:    srli a2, a1, 2
-; CHECK-NEXT:    srli a0, a1, 1
-; CHECK-NEXT:    srli a3, a1, 3
+; CHECK-NEXT:    vmv.v.i v14, 0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vmerge.vim v16, v14, 1, v0
+; CHECK-NEXT:    srli a3, a0, 2
 ; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    vslidedown.vx v10, v9, a0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
+; CHECK-NEXT:    vslidedown.vx v10, v8, a3
+; CHECK-NEXT:    srli a3, a0, 1
+; CHECK-NEXT:    vslidedown.vx v11, v9, a3
+; CHECK-NEXT:    vslidedown.vx v12, v8, a3
+; CHECK-NEXT:    srli a3, a0, 3
 ; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vslidedown.vx v11, v8, a2
-; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v18, v12, 1, v0
-; CHECK-NEXT:    sub a1, a1, a3
-; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v9, v9, a1
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v20, v12, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v22, v12, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v14, v12, 1, v0
-; CHECK-NEXT:    vmv1r.v v10, v15
+; CHECK-NEXT:    vsetvli a4, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v18, v14, 1, v0
+; CHECK-NEXT:    sub a0, a0, a3
 ; CHECK-NEXT:    vmv1r.v v0, v11
-; CHECK-NEXT:    vmerge.vim v24, v12, 1, v0
-; CHECK-NEXT:    vmv1r.v v11, v24
-; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v23
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v9, v14
-; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v14, v12, 1, v0
-; CHECK-NEXT:    vmv1r.v v12, v25
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vmv1r.v v13, v14
-; CHECK-NEXT:    vs8r.v v16, (a0)
-; CHECK-NEXT:    vmv1r.v v14, v15
-; CHECK-NEXT:    vs8r.v v8, (a1)
-; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vlseg7e8.v v8, (a0)
-; CHECK-NEXT:    vlseg7e8.v v16, (a1)
+; CHECK-NEXT:    vmerge.vim v20, v14, 1, v0
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v9, a0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v22, v14, 1, v0
+; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v8, v14, 1, v0
+; CHECK-NEXT:    vmv1r.v v18, v9
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v10, v14, 1, v0
+; CHECK-NEXT:    vmv1r.v v19, v10
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmerge.vim v12, v14, 1, v0
+; CHECK-NEXT:    vmv1r.v v16, v23
+; CHECK-NEXT:    vmv1r.v v17, v8
+; CHECK-NEXT:    vmv1r.v v20, v11
+; CHECK-NEXT:    vmv1r.v v21, v12
+; CHECK-NEXT:    vmv1r.v v22, v13
+; CHECK-NEXT:    vs8r.v v16, (a2)
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vlseg7e8.v v8, (a1)
+; CHECK-NEXT:    vlseg7e8.v v16, (a2)
 ; CHECK-NEXT:    vmv2r.v v24, v8
 ; CHECK-NEXT:    vmv2r.v v26, v10
 ; CHECK-NEXT:    vmv2r.v v28, v12
@@ -889,19 +930,19 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v30, v21
-; CHECK-NEXT:    vmv1r.v v28, v19
-; CHECK-NEXT:    vmv1r.v v29, v20
-; CHECK-NEXT:    vmv1r.v v26, v17
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v27, v18
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v30, v21
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vmv1r.v v24, v15
 ; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv1r.v v28, v19
+; CHECK-NEXT:    vmv1r.v v29, v20
+; CHECK-NEXT:    vmv1r.v v26, v17
+; CHECK-NEXT:    vmv1r.v v27, v18
+; CHECK-NEXT:    vmv1r.v v24, v15
 ; CHECK-NEXT:    vmv1r.v v25, v16
 ; CHECK-NEXT:    vs8r.v v24, (a1)
 ; CHECK-NEXT:    vlseg7e8.v v14, (a0)
@@ -936,19 +977,19 @@ define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v30, v21
-; CHECK-NEXT:    vmv1r.v v28, v19
-; CHECK-NEXT:    vmv1r.v v29, v20
-; CHECK-NEXT:    vmv1r.v v26, v17
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v27, v18
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v30, v21
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vmv1r.v v24, v15
 ; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv1r.v v28, v19
+; CHECK-NEXT:    vmv1r.v v29, v20
+; CHECK-NEXT:    vmv1r.v v26, v17
+; CHECK-NEXT:    vmv1r.v v27, v18
+; CHECK-NEXT:    vmv1r.v v24, v15
 ; CHECK-NEXT:    vmv1r.v v25, v16
 ; CHECK-NEXT:    vs8r.v v24, (a1)
 ; CHECK-NEXT:    vlseg7e16.v v14, (a0)
@@ -983,19 +1024,19 @@ define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v30, v21
-; CHECK-NEXT:    vmv1r.v v28, v19
-; CHECK-NEXT:    vmv1r.v v29, v20
-; CHECK-NEXT:    vmv1r.v v26, v17
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v27, v18
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v30, v21
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vmv1r.v v24, v15
 ; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv1r.v v28, v19
+; CHECK-NEXT:    vmv1r.v v29, v20
+; CHECK-NEXT:    vmv1r.v v26, v17
+; CHECK-NEXT:    vmv1r.v v27, v18
+; CHECK-NEXT:    vmv1r.v v24, v15
 ; CHECK-NEXT:    vmv1r.v v25, v16
 ; CHECK-NEXT:    vs8r.v v24, (a1)
 ; CHECK-NEXT:    vlseg7e32.v v14, (a0)
@@ -1030,19 +1071,19 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v30, v21
-; CHECK-NEXT:    vmv1r.v v28, v19
-; CHECK-NEXT:    vmv1r.v v29, v20
-; CHECK-NEXT:    vmv1r.v v26, v17
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v27, v18
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v30, v21
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vmv1r.v v24, v15
 ; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv1r.v v28, v19
+; CHECK-NEXT:    vmv1r.v v29, v20
+; CHECK-NEXT:    vmv1r.v v26, v17
+; CHECK-NEXT:    vmv1r.v v27, v18
+; CHECK-NEXT:    vmv1r.v v24, v15
 ; CHECK-NEXT:    vmv1r.v v25, v16
 ; CHECK-NEXT:    vs8r.v v24, (a1)
 ; CHECK-NEXT:    vlseg7e64.v v14, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll b/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
index 10929394af75f..5489041dd0367 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
@@ -8,8 +8,8 @@ define i8 @extract_last_i8(<16 x i8> %data, <16 x i8> %mask, i8 %passthru) {
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; CHECK-NEXT:    vmsne.vi v0, v9, 0
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vcpop.m a1, v0
 ; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    vcpop.m a1, v0
 ; CHECK-NEXT:    beqz a1, .LBB0_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    vredmaxu.vs v9, v9, v9
@@ -31,8 +31,8 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
 ; CHECK-NEXT:    vmsne.vi v0, v9, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, mu
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vcpop.m a1, v0
 ; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    vcpop.m a1, v0
 ; CHECK-NEXT:    beqz a1, .LBB1_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    vredmaxu.vs v9, v9, v9
@@ -55,8 +55,8 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
 ; CHECK-NEXT:    vmsne.vi v0, v9, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, mu
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vcpop.m a1, v0
 ; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    vcpop.m a1, v0
 ; CHECK-NEXT:    beqz a1, .LBB2_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    vredmaxu.vs v9, v9, v9
@@ -79,8 +79,8 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
 ; RV32-NEXT:    vmsne.vi v0, v9, 0
 ; RV32-NEXT:    vsetvli zero, zero, e8, mf8, ta, mu
 ; RV32-NEXT:    vmv.v.i v9, 0
-; RV32-NEXT:    vcpop.m a2, v0
 ; RV32-NEXT:    vid.v v9, v0.t
+; RV32-NEXT:    vcpop.m a2, v0
 ; RV32-NEXT:    beqz a2, .LBB3_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    vredmaxu.vs v9, v9, v9
@@ -102,8 +102,8 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
 ; RV64-NEXT:    vmsne.vi v0, v9, 0
 ; RV64-NEXT:    vsetvli zero, zero, e8, mf8, ta, mu
 ; RV64-NEXT:    vmv.v.i v9, 0
-; RV64-NEXT:    vcpop.m a1, v0
 ; RV64-NEXT:    vid.v v9, v0.t
+; RV64-NEXT:    vcpop.m a1, v0
 ; RV64-NEXT:    beqz a1, .LBB3_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    vredmaxu.vs v9, v9, v9
@@ -126,8 +126,8 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass
 ; CHECK-NEXT:    vmsne.vi v0, v9, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, mu
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vcpop.m a0, v0
 ; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    vcpop.m a0, v0
 ; CHECK-NEXT:    beqz a0, .LBB4_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    vredmaxu.vs v9, v9, v9
@@ -150,8 +150,8 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
 ; CHECK-NEXT:    vmsne.vi v0, v9, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, mu
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vcpop.m a0, v0
 ; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    vcpop.m a0, v0
 ; CHECK-NEXT:    beqz a0, .LBB5_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    vredmaxu.vs v9, v9, v9
@@ -172,8 +172,8 @@ define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, mu
 ; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vcpop.m a1, v0
 ; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    vcpop.m a1, v0
 ; CHECK-NEXT:    beqz a1, .LBB6_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
@@ -193,8 +193,8 @@ define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, mu
 ; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vcpop.m a1, v0
 ; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    vcpop.m a1, v0
 ; CHECK-NEXT:    beqz a1, .LBB7_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
@@ -214,8 +214,8 @@ define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, mu
 ; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vcpop.m a1, v0
 ; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    vcpop.m a1, v0
 ; CHECK-NEXT:    beqz a1, .LBB8_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
@@ -235,8 +235,8 @@ define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a2, zero, e8, mf4, ta, mu
 ; RV32-NEXT:    vmv.v.i v10, 0
-; RV32-NEXT:    vcpop.m a2, v0
 ; RV32-NEXT:    vid.v v10, v0.t
+; RV32-NEXT:    vcpop.m a2, v0
 ; RV32-NEXT:    beqz a2, .LBB9_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    vredmaxu.vs v10, v10, v10
@@ -256,8 +256,8 @@ define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli a1, zero, e8, mf4, ta, mu
 ; RV64-NEXT:    vmv.v.i v10, 0
-; RV64-NEXT:    vcpop.m a1, v0
 ; RV64-NEXT:    vid.v v10, v0.t
+; RV64-NEXT:    vcpop.m a1, v0
 ; RV64-NEXT:    beqz a1, .LBB9_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    vredmaxu.vs v10, v10, v10
@@ -277,8 +277,8 @@ define float @extract_last_float_scalable(<vscale x 4 x float> %data, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, mu
 ; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vcpop.m a0, v0
 ; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    vcpop.m a0, v0
 ; CHECK-NEXT:    beqz a0, .LBB10_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
@@ -298,8 +298,8 @@ define double @extract_last_double_scalable(<vscale x 2 x double> %data, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, mu
 ; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vcpop.m a0, v0
 ; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    vcpop.m a0, v0
 ; CHECK-NEXT:    beqz a0, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
index 1e4cb06480163..d45cf2320f253 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
@@ -204,11 +204,11 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
 ; CHECK-NEXT:    vsetvli a4, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsseg5e16.v v8, (a0)
 ; CHECK-NEXT:    add a4, a3, a1
+; CHECK-NEXT:    add a1, a4, a1
 ; CHECK-NEXT:    vle16.v v9, (a2)
 ; CHECK-NEXT:    vle16.v v10, (a4)
 ; CHECK-NEXT:    vle16.v v11, (a3)
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    add a1, a4, a1
 ; CHECK-NEXT:    vle16.v v12, (a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v11, v10, 2
@@ -241,11 +241,11 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
 ; ZVBB-NEXT:    vsetvli a4, zero, e16, mf4, ta, ma
 ; ZVBB-NEXT:    vsseg5e16.v v8, (a0)
 ; ZVBB-NEXT:    add a4, a3, a1
+; ZVBB-NEXT:    add a1, a4, a1
 ; ZVBB-NEXT:    vle16.v v9, (a2)
 ; ZVBB-NEXT:    vle16.v v10, (a4)
 ; ZVBB-NEXT:    vle16.v v11, (a3)
 ; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    add a1, a4, a1
 ; ZVBB-NEXT:    vle16.v v12, (a1)
 ; ZVBB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vslideup.vi v11, v10, 2
@@ -283,24 +283,24 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i
 ; CHECK-NEXT:    vsseg7e8.v v8, (a0)
 ; CHECK-NEXT:    vle8.v v9, (a4)
 ; CHECK-NEXT:    add a4, a4, a1
-; CHECK-NEXT:    vle8.v v10, (a2)
-; CHECK-NEXT:    add a2, a4, a1
-; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    vle8.v v10, (a3)
+; CHECK-NEXT:    add a3, a4, a1
 ; CHECK-NEXT:    vle8.v v11, (a2)
-; CHECK-NEXT:    vle8.v v12, (a4)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vle8.v v12, (a3)
+; CHECK-NEXT:    vle8.v v13, (a4)
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vle8.v v13, (a1)
-; CHECK-NEXT:    vle8.v v14, (a3)
+; CHECK-NEXT:    vle8.v v14, (a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v12, v11, 2
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
+; CHECK-NEXT:    vslideup.vi v13, v12, 2
+; CHECK-NEXT:    vslideup.vi v8, v11, 2
 ; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v12, v13, 4
-; CHECK-NEXT:    vslideup.vi v8, v14, 4
+; CHECK-NEXT:    vslideup.vi v13, v14, 4
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 6
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vslideup.vi v8, v13, 8
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
@@ -325,24 +325,24 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i
 ; ZVBB-NEXT:    vsseg7e8.v v8, (a0)
 ; ZVBB-NEXT:    vle8.v v9, (a4)
 ; ZVBB-NEXT:    add a4, a4, a1
-; ZVBB-NEXT:    vle8.v v10, (a2)
-; ZVBB-NEXT:    add a2, a4, a1
-; ZVBB-NEXT:    add a1, a2, a1
+; ZVBB-NEXT:    vle8.v v10, (a3)
+; ZVBB-NEXT:    add a3, a4, a1
 ; ZVBB-NEXT:    vle8.v v11, (a2)
-; ZVBB-NEXT:    vle8.v v12, (a4)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vle8.v v12, (a3)
+; ZVBB-NEXT:    vle8.v v13, (a4)
 ; ZVBB-NEXT:    vle8.v v8, (a0)
-; ZVBB-NEXT:    vle8.v v13, (a1)
-; ZVBB-NEXT:    vle8.v v14, (a3)
+; ZVBB-NEXT:    vle8.v v14, (a1)
 ; ZVBB-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; ZVBB-NEXT:    vslideup.vi v12, v11, 2
-; ZVBB-NEXT:    vslideup.vi v8, v10, 2
+; ZVBB-NEXT:    vslideup.vi v13, v12, 2
+; ZVBB-NEXT:    vslideup.vi v8, v11, 2
 ; ZVBB-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; ZVBB-NEXT:    vslideup.vi v12, v13, 4
-; ZVBB-NEXT:    vslideup.vi v8, v14, 4
+; ZVBB-NEXT:    vslideup.vi v13, v14, 4
+; ZVBB-NEXT:    vslideup.vi v8, v10, 4
 ; ZVBB-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; ZVBB-NEXT:    vslideup.vi v8, v9, 6
 ; ZVBB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vi v8, v12, 8
+; ZVBB-NEXT:    vslideup.vi v8, v13, 8
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    .cfi_def_cfa sp, 16
@@ -579,11 +579,11 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b
 ; CHECK-NEXT:    vsetvli a4, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsseg5e16.v v8, (a0)
 ; CHECK-NEXT:    add a4, a3, a1
+; CHECK-NEXT:    add a1, a4, a1
 ; CHECK-NEXT:    vle16.v v9, (a2)
 ; CHECK-NEXT:    vle16.v v10, (a4)
 ; CHECK-NEXT:    vle16.v v11, (a3)
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    add a1, a4, a1
 ; CHECK-NEXT:    vle16.v v12, (a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v11, v10, 2
@@ -616,11 +616,11 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b
 ; ZVBB-NEXT:    vsetvli a4, zero, e16, mf4, ta, ma
 ; ZVBB-NEXT:    vsseg5e16.v v8, (a0)
 ; ZVBB-NEXT:    add a4, a3, a1
+; ZVBB-NEXT:    add a1, a4, a1
 ; ZVBB-NEXT:    vle16.v v9, (a2)
 ; ZVBB-NEXT:    vle16.v v10, (a4)
 ; ZVBB-NEXT:    vle16.v v11, (a3)
 ; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    add a1, a4, a1
 ; ZVBB-NEXT:    vle16.v v12, (a1)
 ; ZVBB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vslideup.vi v11, v10, 2
@@ -659,24 +659,24 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b,
 ; CHECK-NEXT:    vsseg7e16.v v8, (a0)
 ; CHECK-NEXT:    vle16.v v9, (a4)
 ; CHECK-NEXT:    add a4, a4, a1
-; CHECK-NEXT:    vle16.v v10, (a2)
-; CHECK-NEXT:    add a2, a4, a1
-; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    vle16.v v10, (a3)
+; CHECK-NEXT:    add a3, a4, a1
 ; CHECK-NEXT:    vle16.v v11, (a2)
-; CHECK-NEXT:    vle16.v v12, (a4)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vle16.v v12, (a3)
+; CHECK-NEXT:    vle16.v v13, (a4)
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vle16.v v13, (a1)
-; CHECK-NEXT:    vle16.v v14, (a3)
+; CHECK-NEXT:    vle16.v v14, (a1)
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v12, v11, 1
-; CHECK-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-NEXT:    vslideup.vi v13, v12, 1
+; CHECK-NEXT:    vslideup.vi v8, v11, 1
 ; CHECK-NEXT:    vsetivli zero, 3, e16, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v12, v13, 2
-; CHECK-NEXT:    vslideup.vi v8, v14, 2
+; CHECK-NEXT:    vslideup.vi v13, v14, 2
+; CHECK-NEXT:    vslideup.vi v8, v10, 2
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 3
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 4
+; CHECK-NEXT:    vslideup.vi v8, v13, 4
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
@@ -703,24 +703,24 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b,
 ; ZVBB-NEXT:    vsseg7e16.v v8, (a0)
 ; ZVBB-NEXT:    vle16.v v9, (a4)
 ; ZVBB-NEXT:    add a4, a4, a1
-; ZVBB-NEXT:    vle16.v v10, (a2)
-; ZVBB-NEXT:    add a2, a4, a1
-; ZVBB-NEXT:    add a1, a2, a1
+; ZVBB-NEXT:    vle16.v v10, (a3)
+; ZVBB-NEXT:    add a3, a4, a1
 ; ZVBB-NEXT:    vle16.v v11, (a2)
-; ZVBB-NEXT:    vle16.v v12, (a4)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vle16.v v12, (a3)
+; ZVBB-NEXT:    vle16.v v13, (a4)
 ; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vle16.v v13, (a1)
-; ZVBB-NEXT:    vle16.v v14, (a3)
+; ZVBB-NEXT:    vle16.v v14, (a1)
 ; ZVBB-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
-; ZVBB-NEXT:    vslideup.vi v12, v11, 1
-; ZVBB-NEXT:    vslideup.vi v8, v10, 1
+; ZVBB-NEXT:    vslideup.vi v13, v12, 1
+; ZVBB-NEXT:    vslideup.vi v8, v11, 1
 ; ZVBB-NEXT:    vsetivli zero, 3, e16, mf2, tu, ma
-; ZVBB-NEXT:    vslideup.vi v12, v13, 2
-; ZVBB-NEXT:    vslideup.vi v8, v14, 2
+; ZVBB-NEXT:    vslideup.vi v13, v14, 2
+; ZVBB-NEXT:    vslideup.vi v8, v10, 2
 ; ZVBB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vslideup.vi v8, v9, 3
 ; ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vi v8, v12, 4
+; ZVBB-NEXT:    vslideup.vi v8, v13, 4
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index 6aa62c2256925..53ec22f361254 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -124,9 +124,9 @@ define <vscale x 4 x i64> @vector_interleave_nxv4i64_nxv2i64(<vscale x 2 x i64>
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    vand.vi v13, v12, 1
-; CHECK-NEXT:    vmsne.vi v0, v13, 0
 ; CHECK-NEXT:    vsrl.vi v16, v12, 1
+; CHECK-NEXT:    vand.vi v12, v12, 1
+; CHECK-NEXT:    vmsne.vi v0, v12, 0
 ; CHECK-NEXT:    vadd.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
@@ -139,9 +139,9 @@ define <vscale x 4 x i64> @vector_interleave_nxv4i64_nxv2i64(<vscale x 2 x i64>
 ; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
 ; ZVBB-NEXT:    vid.v v12
 ; ZVBB-NEXT:    srli a0, a0, 2
-; ZVBB-NEXT:    vand.vi v13, v12, 1
-; ZVBB-NEXT:    vmsne.vi v0, v13, 0
 ; ZVBB-NEXT:    vsrl.vi v16, v12, 1
+; ZVBB-NEXT:    vand.vi v12, v12, 1
+; ZVBB-NEXT:    vmsne.vi v0, v12, 0
 ; ZVBB-NEXT:    vadd.vx v16, v16, a0, v0.t
 ; ZVBB-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; ZVBB-NEXT:    vrgatherei16.vv v12, v8, v16
@@ -287,13 +287,13 @@ define <vscale x 16 x i64> @vector_interleave_nxv16i64_nxv8i64(<vscale x 8 x i64
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    vmv4r.v v28, v16
 ; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vsrl.vi v4, v6, 1
 ; CHECK-NEXT:    vand.vi v8, v6, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    vsrl.vi v6, v6, 1
-; CHECK-NEXT:    vadd.vx v6, v6, a0, v0.t
+; CHECK-NEXT:    vadd.vx v4, v4, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v8, v24, v6
-; CHECK-NEXT:    vrgatherei16.vv v24, v16, v6
+; CHECK-NEXT:    vrgatherei16.vv v8, v24, v4
+; CHECK-NEXT:    vrgatherei16.vv v24, v16, v4
 ; CHECK-NEXT:    vmv.v.v v16, v24
 ; CHECK-NEXT:    ret
 ;
@@ -306,13 +306,13 @@ define <vscale x 16 x i64> @vector_interleave_nxv16i64_nxv8i64(<vscale x 8 x i64
 ; ZVBB-NEXT:    srli a0, a0, 1
 ; ZVBB-NEXT:    vmv4r.v v28, v16
 ; ZVBB-NEXT:    vmv4r.v v16, v12
+; ZVBB-NEXT:    vsrl.vi v4, v6, 1
 ; ZVBB-NEXT:    vand.vi v8, v6, 1
 ; ZVBB-NEXT:    vmsne.vi v0, v8, 0
-; ZVBB-NEXT:    vsrl.vi v6, v6, 1
-; ZVBB-NEXT:    vadd.vx v6, v6, a0, v0.t
+; ZVBB-NEXT:    vadd.vx v4, v4, a0, v0.t
 ; ZVBB-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; ZVBB-NEXT:    vrgatherei16.vv v8, v24, v6
-; ZVBB-NEXT:    vrgatherei16.vv v24, v16, v6
+; ZVBB-NEXT:    vrgatherei16.vv v8, v24, v4
+; ZVBB-NEXT:    vrgatherei16.vv v24, v16, v4
 ; ZVBB-NEXT:    vmv.v.v v16, v24
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
@@ -527,9 +527,9 @@ define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv2f64(<vscale x 2 x do
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    vand.vi v13, v12, 1
-; CHECK-NEXT:    vmsne.vi v0, v13, 0
 ; CHECK-NEXT:    vsrl.vi v16, v12, 1
+; CHECK-NEXT:    vand.vi v12, v12, 1
+; CHECK-NEXT:    vmsne.vi v0, v12, 0
 ; CHECK-NEXT:    vadd.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
@@ -542,9 +542,9 @@ define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv2f64(<vscale x 2 x do
 ; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
 ; ZVBB-NEXT:    vid.v v12
 ; ZVBB-NEXT:    srli a0, a0, 2
-; ZVBB-NEXT:    vand.vi v13, v12, 1
-; ZVBB-NEXT:    vmsne.vi v0, v13, 0
 ; ZVBB-NEXT:    vsrl.vi v16, v12, 1
+; ZVBB-NEXT:    vand.vi v12, v12, 1
+; ZVBB-NEXT:    vmsne.vi v0, v12, 0
 ; ZVBB-NEXT:    vadd.vx v16, v16, a0, v0.t
 ; ZVBB-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; ZVBB-NEXT:    vrgatherei16.vv v12, v8, v16
@@ -648,13 +648,13 @@ define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv8f64(<vscale x 8 x
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    vmv4r.v v28, v16
 ; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vsrl.vi v4, v6, 1
 ; CHECK-NEXT:    vand.vi v8, v6, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    vsrl.vi v6, v6, 1
-; CHECK-NEXT:    vadd.vx v6, v6, a0, v0.t
+; CHECK-NEXT:    vadd.vx v4, v4, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v8, v24, v6
-; CHECK-NEXT:    vrgatherei16.vv v24, v16, v6
+; CHECK-NEXT:    vrgatherei16.vv v8, v24, v4
+; CHECK-NEXT:    vrgatherei16.vv v24, v16, v4
 ; CHECK-NEXT:    vmv.v.v v16, v24
 ; CHECK-NEXT:    ret
 ;
@@ -667,13 +667,13 @@ define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv8f64(<vscale x 8 x
 ; ZVBB-NEXT:    srli a0, a0, 1
 ; ZVBB-NEXT:    vmv4r.v v28, v16
 ; ZVBB-NEXT:    vmv4r.v v16, v12
+; ZVBB-NEXT:    vsrl.vi v4, v6, 1
 ; ZVBB-NEXT:    vand.vi v8, v6, 1
 ; ZVBB-NEXT:    vmsne.vi v0, v8, 0
-; ZVBB-NEXT:    vsrl.vi v6, v6, 1
-; ZVBB-NEXT:    vadd.vx v6, v6, a0, v0.t
+; ZVBB-NEXT:    vadd.vx v4, v4, a0, v0.t
 ; ZVBB-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; ZVBB-NEXT:    vrgatherei16.vv v8, v24, v6
-; ZVBB-NEXT:    vrgatherei16.vv v24, v16, v6
+; ZVBB-NEXT:    vrgatherei16.vv v8, v24, v4
+; ZVBB-NEXT:    vrgatherei16.vv v24, v16, v4
 ; ZVBB-NEXT:    vmv.v.v v16, v24
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 16 x double> @llvm.vector.interleave2.nxv16f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b)
@@ -745,12 +745,12 @@ define <vscale x 48 x i1> @vector_interleave_nxv48i1_nxv16i1(<vscale x 16 x i1>
 ; CHECK-NEXT:    srli a2, a1, 2
 ; CHECK-NEXT:    srli a1, a1, 1
 ; CHECK-NEXT:    vl2r.v v10, (a3)
+; CHECK-NEXT:    add a3, a2, a2
 ; CHECK-NEXT:    vl2r.v v12, (a0)
-; CHECK-NEXT:    add a0, a2, a2
 ; CHECK-NEXT:    vmsne.vi v14, v8, 0
 ; CHECK-NEXT:    vmsne.vi v8, v10, 0
 ; CHECK-NEXT:    vmsne.vi v0, v12, 0
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v0, v8, a2
 ; CHECK-NEXT:    add a0, a1, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -788,12 +788,12 @@ define <vscale x 48 x i1> @vector_interleave_nxv48i1_nxv16i1(<vscale x 16 x i1>
 ; ZVBB-NEXT:    srli a2, a1, 2
 ; ZVBB-NEXT:    srli a1, a1, 1
 ; ZVBB-NEXT:    vl2r.v v10, (a3)
+; ZVBB-NEXT:    add a3, a2, a2
 ; ZVBB-NEXT:    vl2r.v v12, (a0)
-; ZVBB-NEXT:    add a0, a2, a2
 ; ZVBB-NEXT:    vmsne.vi v14, v8, 0
 ; ZVBB-NEXT:    vmsne.vi v8, v10, 0
 ; ZVBB-NEXT:    vmsne.vi v0, v12, 0
-; ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v0, v8, a2
 ; ZVBB-NEXT:    add a0, a1, a1
 ; ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -1045,12 +1045,12 @@ define <vscale x 80 x i1> @vector_interleave_nxv80i1_nxv16i1(<vscale x 16 x i1>
 ; CHECK-NEXT:    vmv2r.v v20, v14
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
+; CHECK-NEXT:    add a5, a2, a1
 ; CHECK-NEXT:    vmv1r.v v21, v18
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
 ; CHECK-NEXT:    vmv1r.v v22, v16
 ; CHECK-NEXT:    vmv1r.v v16, v19
-; CHECK-NEXT:    add a5, a2, a1
 ; CHECK-NEXT:    vmv1r.v v23, v8
 ; CHECK-NEXT:    vmv1r.v v18, v9
 ; CHECK-NEXT:    vmv1r.v v0, v11
@@ -1121,12 +1121,12 @@ define <vscale x 80 x i1> @vector_interleave_nxv80i1_nxv16i1(<vscale x 16 x i1>
 ; ZVBB-NEXT:    vmv2r.v v20, v14
 ; ZVBB-NEXT:    vmv1r.v v0, v9
 ; ZVBB-NEXT:    vmerge.vim v16, v12, 1, v0
+; ZVBB-NEXT:    add a5, a2, a1
 ; ZVBB-NEXT:    vmv1r.v v21, v18
 ; ZVBB-NEXT:    vmv1r.v v0, v10
 ; ZVBB-NEXT:    vmerge.vim v8, v12, 1, v0
 ; ZVBB-NEXT:    vmv1r.v v22, v16
 ; ZVBB-NEXT:    vmv1r.v v16, v19
-; ZVBB-NEXT:    add a5, a2, a1
 ; ZVBB-NEXT:    vmv1r.v v23, v8
 ; ZVBB-NEXT:    vmv1r.v v18, v9
 ; ZVBB-NEXT:    vmv1r.v v0, v11
@@ -1192,26 +1192,26 @@ define <vscale x 80 x i8> @vector_interleave_nxv80i8_nxv16i8(<vscale x 16 x i8>
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-NEXT:    vmv2r.v v20, v16
-; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    vmv2r.v v18, v12
+; RV32-NEXT:    vmv2r.v v16, v8
+; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a2, a1, 2
 ; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 64
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    vmv2r.v v16, v8
+; RV32-NEXT:    add a3, a0, a2
+; RV32-NEXT:    add a4, a1, a2
 ; RV32-NEXT:    vmv2r.v v22, v16
 ; RV32-NEXT:    vmv2r.v v24, v18
 ; RV32-NEXT:    vmv1r.v v26, v20
-; RV32-NEXT:    add a3, a0, a2
-; RV32-NEXT:    vmv1r.v v23, v10
-; RV32-NEXT:    add a4, a1, a2
 ; RV32-NEXT:    add a5, a4, a2
-; RV32-NEXT:    vmv1r.v v25, v14
+; RV32-NEXT:    vmv1r.v v23, v10
 ; RV32-NEXT:    add a6, a5, a2
-; RV32-NEXT:    vmv1r.v v18, v11
+; RV32-NEXT:    vmv1r.v v25, v14
 ; RV32-NEXT:    vsseg5e8.v v22, (a0)
+; RV32-NEXT:    vmv1r.v v18, v11
 ; RV32-NEXT:    vmv1r.v v20, v15
 ; RV32-NEXT:    vsseg5e8.v v17, (a1)
 ; RV32-NEXT:    vl1r.v v16, (a6)
@@ -1230,10 +1230,10 @@ define <vscale x 80 x i8> @vector_interleave_nxv80i8_nxv16i8(<vscale x 16 x i8>
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 64
 ; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    vl1r.v v15, (a5)
 ; RV32-NEXT:    vl1r.v v12, (a6)
 ; RV32-NEXT:    vl1r.v v13, (a1)
-; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, a0, a2
 ; RV32-NEXT:    vs2r.v v16, (a2)
 ; RV32-NEXT:    vs8r.v v8, (a0)
@@ -1258,26 +1258,26 @@ define <vscale x 80 x i8> @vector_interleave_nxv80i8_nxv16i8(<vscale x 16 x i8>
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-NEXT:    vmv2r.v v20, v16
-; RV64-NEXT:    addi a0, sp, 64
 ; RV64-NEXT:    vmv2r.v v18, v12
+; RV64-NEXT:    vmv2r.v v16, v8
+; RV64-NEXT:    addi a0, sp, 64
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a2, a1, 2
 ; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 64
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    vmv2r.v v16, v8
+; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    add a4, a1, a2
 ; RV64-NEXT:    vmv2r.v v22, v16
 ; RV64-NEXT:    vmv2r.v v24, v18
 ; RV64-NEXT:    vmv1r.v v26, v20
-; RV64-NEXT:    add a3, a0, a2
-; RV64-NEXT:    vmv1r.v v23, v10
-; RV64-NEXT:    add a4, a1, a2
 ; RV64-NEXT:    add a5, a4, a2
-; RV64-NEXT:    vmv1r.v v25, v14
+; RV64-NEXT:    vmv1r.v v23, v10
 ; RV64-NEXT:    add a6, a5, a2
-; RV64-NEXT:    vmv1r.v v18, v11
+; RV64-NEXT:    vmv1r.v v25, v14
 ; RV64-NEXT:    vsseg5e8.v v22, (a0)
+; RV64-NEXT:    vmv1r.v v18, v11
 ; RV64-NEXT:    vmv1r.v v20, v15
 ; RV64-NEXT:    vsseg5e8.v v17, (a1)
 ; RV64-NEXT:    vl1r.v v16, (a6)
@@ -1296,10 +1296,10 @@ define <vscale x 80 x i8> @vector_interleave_nxv80i8_nxv16i8(<vscale x 16 x i8>
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 64
 ; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    vl1r.v v15, (a5)
 ; RV64-NEXT:    vl1r.v v12, (a6)
 ; RV64-NEXT:    vl1r.v v13, (a1)
-; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, a0, a2
 ; RV64-NEXT:    vs2r.v v16, (a2)
 ; RV64-NEXT:    vs8r.v v8, (a0)
@@ -1324,26 +1324,26 @@ define <vscale x 80 x i8> @vector_interleave_nxv80i8_nxv16i8(<vscale x 16 x i8>
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
 ; ZVBB-RV32-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; ZVBB-RV32-NEXT:    vmv2r.v v20, v16
-; ZVBB-RV32-NEXT:    addi a0, sp, 64
 ; ZVBB-RV32-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV32-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
 ; ZVBB-RV32-NEXT:    csrr a1, vlenb
 ; ZVBB-RV32-NEXT:    slli a2, a1, 2
 ; ZVBB-RV32-NEXT:    add a1, a2, a1
 ; ZVBB-RV32-NEXT:    add a1, sp, a1
 ; ZVBB-RV32-NEXT:    addi a1, a1, 64
 ; ZVBB-RV32-NEXT:    csrr a2, vlenb
-; ZVBB-RV32-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV32-NEXT:    add a3, a0, a2
+; ZVBB-RV32-NEXT:    add a4, a1, a2
 ; ZVBB-RV32-NEXT:    vmv2r.v v22, v16
 ; ZVBB-RV32-NEXT:    vmv2r.v v24, v18
 ; ZVBB-RV32-NEXT:    vmv1r.v v26, v20
-; ZVBB-RV32-NEXT:    add a3, a0, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v23, v10
-; ZVBB-RV32-NEXT:    add a4, a1, a2
 ; ZVBB-RV32-NEXT:    add a5, a4, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV32-NEXT:    vmv1r.v v23, v10
 ; ZVBB-RV32-NEXT:    add a6, a5, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v18, v11
+; ZVBB-RV32-NEXT:    vmv1r.v v25, v14
 ; ZVBB-RV32-NEXT:    vsseg5e8.v v22, (a0)
+; ZVBB-RV32-NEXT:    vmv1r.v v18, v11
 ; ZVBB-RV32-NEXT:    vmv1r.v v20, v15
 ; ZVBB-RV32-NEXT:    vsseg5e8.v v17, (a1)
 ; ZVBB-RV32-NEXT:    vl1r.v v16, (a6)
@@ -1362,10 +1362,10 @@ define <vscale x 80 x i8> @vector_interleave_nxv80i8_nxv16i8(<vscale x 16 x i8>
 ; ZVBB-RV32-NEXT:    add a0, sp, a0
 ; ZVBB-RV32-NEXT:    addi a0, a0, 64
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    slli a2, a2, 3
 ; ZVBB-RV32-NEXT:    vl1r.v v15, (a5)
 ; ZVBB-RV32-NEXT:    vl1r.v v12, (a6)
 ; ZVBB-RV32-NEXT:    vl1r.v v13, (a1)
-; ZVBB-RV32-NEXT:    slli a2, a2, 3
 ; ZVBB-RV32-NEXT:    add a2, a0, a2
 ; ZVBB-RV32-NEXT:    vs2r.v v16, (a2)
 ; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
@@ -1390,26 +1390,26 @@ define <vscale x 80 x i8> @vector_interleave_nxv80i8_nxv16i8(<vscale x 16 x i8>
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
 ; ZVBB-RV64-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; ZVBB-RV64-NEXT:    vmv2r.v v20, v16
-; ZVBB-RV64-NEXT:    addi a0, sp, 64
 ; ZVBB-RV64-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV64-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
 ; ZVBB-RV64-NEXT:    csrr a1, vlenb
 ; ZVBB-RV64-NEXT:    slli a2, a1, 2
 ; ZVBB-RV64-NEXT:    add a1, a2, a1
 ; ZVBB-RV64-NEXT:    add a1, sp, a1
 ; ZVBB-RV64-NEXT:    addi a1, a1, 64
 ; ZVBB-RV64-NEXT:    csrr a2, vlenb
-; ZVBB-RV64-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV64-NEXT:    add a3, a0, a2
+; ZVBB-RV64-NEXT:    add a4, a1, a2
 ; ZVBB-RV64-NEXT:    vmv2r.v v22, v16
 ; ZVBB-RV64-NEXT:    vmv2r.v v24, v18
 ; ZVBB-RV64-NEXT:    vmv1r.v v26, v20
-; ZVBB-RV64-NEXT:    add a3, a0, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v23, v10
-; ZVBB-RV64-NEXT:    add a4, a1, a2
 ; ZVBB-RV64-NEXT:    add a5, a4, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV64-NEXT:    vmv1r.v v23, v10
 ; ZVBB-RV64-NEXT:    add a6, a5, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v18, v11
+; ZVBB-RV64-NEXT:    vmv1r.v v25, v14
 ; ZVBB-RV64-NEXT:    vsseg5e8.v v22, (a0)
+; ZVBB-RV64-NEXT:    vmv1r.v v18, v11
 ; ZVBB-RV64-NEXT:    vmv1r.v v20, v15
 ; ZVBB-RV64-NEXT:    vsseg5e8.v v17, (a1)
 ; ZVBB-RV64-NEXT:    vl1r.v v16, (a6)
@@ -1428,10 +1428,10 @@ define <vscale x 80 x i8> @vector_interleave_nxv80i8_nxv16i8(<vscale x 16 x i8>
 ; ZVBB-RV64-NEXT:    add a0, sp, a0
 ; ZVBB-RV64-NEXT:    addi a0, a0, 64
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    slli a2, a2, 3
 ; ZVBB-RV64-NEXT:    vl1r.v v15, (a5)
 ; ZVBB-RV64-NEXT:    vl1r.v v12, (a6)
 ; ZVBB-RV64-NEXT:    vl1r.v v13, (a1)
-; ZVBB-RV64-NEXT:    slli a2, a2, 3
 ; ZVBB-RV64-NEXT:    add a2, a0, a2
 ; ZVBB-RV64-NEXT:    vs2r.v v16, (a2)
 ; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
@@ -1521,26 +1521,26 @@ define <vscale x 20 x i32> @vector_interleave_nxv20i32_nxv4i32(<vscale x 4 x i32
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv2r.v v20, v16
-; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    vmv2r.v v18, v12
+; RV32-NEXT:    vmv2r.v v16, v8
+; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a2, a1, 2
 ; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 64
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    vmv2r.v v16, v8
+; RV32-NEXT:    add a3, a0, a2
+; RV32-NEXT:    add a4, a1, a2
 ; RV32-NEXT:    vmv2r.v v22, v16
 ; RV32-NEXT:    vmv2r.v v24, v18
 ; RV32-NEXT:    vmv1r.v v26, v20
-; RV32-NEXT:    add a3, a0, a2
-; RV32-NEXT:    vmv1r.v v23, v10
-; RV32-NEXT:    add a4, a1, a2
 ; RV32-NEXT:    add a5, a4, a2
-; RV32-NEXT:    vmv1r.v v25, v14
+; RV32-NEXT:    vmv1r.v v23, v10
 ; RV32-NEXT:    add a6, a5, a2
-; RV32-NEXT:    vmv1r.v v18, v11
+; RV32-NEXT:    vmv1r.v v25, v14
 ; RV32-NEXT:    vsseg5e32.v v22, (a0)
+; RV32-NEXT:    vmv1r.v v18, v11
 ; RV32-NEXT:    vmv1r.v v20, v15
 ; RV32-NEXT:    vsseg5e32.v v17, (a1)
 ; RV32-NEXT:    vl1re32.v v16, (a6)
@@ -1559,10 +1559,10 @@ define <vscale x 20 x i32> @vector_interleave_nxv20i32_nxv4i32(<vscale x 4 x i32
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 64
 ; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    vl1re32.v v15, (a5)
 ; RV32-NEXT:    vl1re32.v v12, (a6)
 ; RV32-NEXT:    vl1re32.v v13, (a1)
-; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, a0, a2
 ; RV32-NEXT:    vs2r.v v16, (a2)
 ; RV32-NEXT:    vs8r.v v8, (a0)
@@ -1587,26 +1587,26 @@ define <vscale x 20 x i32> @vector_interleave_nxv20i32_nxv4i32(<vscale x 4 x i32
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; RV64-NEXT:    vmv2r.v v20, v16
-; RV64-NEXT:    addi a0, sp, 64
 ; RV64-NEXT:    vmv2r.v v18, v12
+; RV64-NEXT:    vmv2r.v v16, v8
+; RV64-NEXT:    addi a0, sp, 64
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a2, a1, 2
 ; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 64
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    vmv2r.v v16, v8
+; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    add a4, a1, a2
 ; RV64-NEXT:    vmv2r.v v22, v16
 ; RV64-NEXT:    vmv2r.v v24, v18
 ; RV64-NEXT:    vmv1r.v v26, v20
-; RV64-NEXT:    add a3, a0, a2
-; RV64-NEXT:    vmv1r.v v23, v10
-; RV64-NEXT:    add a4, a1, a2
 ; RV64-NEXT:    add a5, a4, a2
-; RV64-NEXT:    vmv1r.v v25, v14
+; RV64-NEXT:    vmv1r.v v23, v10
 ; RV64-NEXT:    add a6, a5, a2
-; RV64-NEXT:    vmv1r.v v18, v11
+; RV64-NEXT:    vmv1r.v v25, v14
 ; RV64-NEXT:    vsseg5e32.v v22, (a0)
+; RV64-NEXT:    vmv1r.v v18, v11
 ; RV64-NEXT:    vmv1r.v v20, v15
 ; RV64-NEXT:    vsseg5e32.v v17, (a1)
 ; RV64-NEXT:    vl1re32.v v16, (a6)
@@ -1625,10 +1625,10 @@ define <vscale x 20 x i32> @vector_interleave_nxv20i32_nxv4i32(<vscale x 4 x i32
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 64
 ; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    vl1re32.v v15, (a5)
 ; RV64-NEXT:    vl1re32.v v12, (a6)
 ; RV64-NEXT:    vl1re32.v v13, (a1)
-; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, a0, a2
 ; RV64-NEXT:    vs2r.v v16, (a2)
 ; RV64-NEXT:    vs8r.v v8, (a0)
@@ -1653,26 +1653,26 @@ define <vscale x 20 x i32> @vector_interleave_nxv20i32_nxv4i32(<vscale x 4 x i32
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
 ; ZVBB-RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; ZVBB-RV32-NEXT:    vmv2r.v v20, v16
-; ZVBB-RV32-NEXT:    addi a0, sp, 64
 ; ZVBB-RV32-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV32-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
 ; ZVBB-RV32-NEXT:    csrr a1, vlenb
 ; ZVBB-RV32-NEXT:    slli a2, a1, 2
 ; ZVBB-RV32-NEXT:    add a1, a2, a1
 ; ZVBB-RV32-NEXT:    add a1, sp, a1
 ; ZVBB-RV32-NEXT:    addi a1, a1, 64
 ; ZVBB-RV32-NEXT:    csrr a2, vlenb
-; ZVBB-RV32-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV32-NEXT:    add a3, a0, a2
+; ZVBB-RV32-NEXT:    add a4, a1, a2
 ; ZVBB-RV32-NEXT:    vmv2r.v v22, v16
 ; ZVBB-RV32-NEXT:    vmv2r.v v24, v18
 ; ZVBB-RV32-NEXT:    vmv1r.v v26, v20
-; ZVBB-RV32-NEXT:    add a3, a0, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v23, v10
-; ZVBB-RV32-NEXT:    add a4, a1, a2
 ; ZVBB-RV32-NEXT:    add a5, a4, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV32-NEXT:    vmv1r.v v23, v10
 ; ZVBB-RV32-NEXT:    add a6, a5, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v18, v11
+; ZVBB-RV32-NEXT:    vmv1r.v v25, v14
 ; ZVBB-RV32-NEXT:    vsseg5e32.v v22, (a0)
+; ZVBB-RV32-NEXT:    vmv1r.v v18, v11
 ; ZVBB-RV32-NEXT:    vmv1r.v v20, v15
 ; ZVBB-RV32-NEXT:    vsseg5e32.v v17, (a1)
 ; ZVBB-RV32-NEXT:    vl1re32.v v16, (a6)
@@ -1691,10 +1691,10 @@ define <vscale x 20 x i32> @vector_interleave_nxv20i32_nxv4i32(<vscale x 4 x i32
 ; ZVBB-RV32-NEXT:    add a0, sp, a0
 ; ZVBB-RV32-NEXT:    addi a0, a0, 64
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    slli a2, a2, 3
 ; ZVBB-RV32-NEXT:    vl1re32.v v15, (a5)
 ; ZVBB-RV32-NEXT:    vl1re32.v v12, (a6)
 ; ZVBB-RV32-NEXT:    vl1re32.v v13, (a1)
-; ZVBB-RV32-NEXT:    slli a2, a2, 3
 ; ZVBB-RV32-NEXT:    add a2, a0, a2
 ; ZVBB-RV32-NEXT:    vs2r.v v16, (a2)
 ; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
@@ -1719,26 +1719,26 @@ define <vscale x 20 x i32> @vector_interleave_nxv20i32_nxv4i32(<vscale x 4 x i32
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
 ; ZVBB-RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; ZVBB-RV64-NEXT:    vmv2r.v v20, v16
-; ZVBB-RV64-NEXT:    addi a0, sp, 64
 ; ZVBB-RV64-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV64-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
 ; ZVBB-RV64-NEXT:    csrr a1, vlenb
 ; ZVBB-RV64-NEXT:    slli a2, a1, 2
 ; ZVBB-RV64-NEXT:    add a1, a2, a1
 ; ZVBB-RV64-NEXT:    add a1, sp, a1
 ; ZVBB-RV64-NEXT:    addi a1, a1, 64
 ; ZVBB-RV64-NEXT:    csrr a2, vlenb
-; ZVBB-RV64-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV64-NEXT:    add a3, a0, a2
+; ZVBB-RV64-NEXT:    add a4, a1, a2
 ; ZVBB-RV64-NEXT:    vmv2r.v v22, v16
 ; ZVBB-RV64-NEXT:    vmv2r.v v24, v18
 ; ZVBB-RV64-NEXT:    vmv1r.v v26, v20
-; ZVBB-RV64-NEXT:    add a3, a0, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v23, v10
-; ZVBB-RV64-NEXT:    add a4, a1, a2
 ; ZVBB-RV64-NEXT:    add a5, a4, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV64-NEXT:    vmv1r.v v23, v10
 ; ZVBB-RV64-NEXT:    add a6, a5, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v18, v11
+; ZVBB-RV64-NEXT:    vmv1r.v v25, v14
 ; ZVBB-RV64-NEXT:    vsseg5e32.v v22, (a0)
+; ZVBB-RV64-NEXT:    vmv1r.v v18, v11
 ; ZVBB-RV64-NEXT:    vmv1r.v v20, v15
 ; ZVBB-RV64-NEXT:    vsseg5e32.v v17, (a1)
 ; ZVBB-RV64-NEXT:    vl1re32.v v16, (a6)
@@ -1757,10 +1757,10 @@ define <vscale x 20 x i32> @vector_interleave_nxv20i32_nxv4i32(<vscale x 4 x i32
 ; ZVBB-RV64-NEXT:    add a0, sp, a0
 ; ZVBB-RV64-NEXT:    addi a0, a0, 64
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    slli a2, a2, 3
 ; ZVBB-RV64-NEXT:    vl1re32.v v15, (a5)
 ; ZVBB-RV64-NEXT:    vl1re32.v v12, (a6)
 ; ZVBB-RV64-NEXT:    vl1re32.v v13, (a1)
-; ZVBB-RV64-NEXT:    slli a2, a2, 3
 ; ZVBB-RV64-NEXT:    add a2, a0, a2
 ; ZVBB-RV64-NEXT:    vs2r.v v16, (a2)
 ; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
@@ -1791,26 +1791,26 @@ define <vscale x 10 x i64> @vector_interleave_nxv10i64_nxv2i64(<vscale x 2 x i64
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vmv2r.v v20, v16
-; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    vmv2r.v v18, v12
+; RV32-NEXT:    vmv2r.v v16, v8
+; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a2, a1, 2
 ; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 64
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    vmv2r.v v16, v8
+; RV32-NEXT:    add a3, a0, a2
+; RV32-NEXT:    add a4, a1, a2
 ; RV32-NEXT:    vmv2r.v v22, v16
 ; RV32-NEXT:    vmv2r.v v24, v18
 ; RV32-NEXT:    vmv1r.v v26, v20
-; RV32-NEXT:    add a3, a0, a2
-; RV32-NEXT:    vmv1r.v v23, v10
-; RV32-NEXT:    add a4, a1, a2
 ; RV32-NEXT:    add a5, a4, a2
-; RV32-NEXT:    vmv1r.v v25, v14
+; RV32-NEXT:    vmv1r.v v23, v10
 ; RV32-NEXT:    add a6, a5, a2
-; RV32-NEXT:    vmv1r.v v18, v11
+; RV32-NEXT:    vmv1r.v v25, v14
 ; RV32-NEXT:    vsseg5e64.v v22, (a0)
+; RV32-NEXT:    vmv1r.v v18, v11
 ; RV32-NEXT:    vmv1r.v v20, v15
 ; RV32-NEXT:    vsseg5e64.v v17, (a1)
 ; RV32-NEXT:    vl1re64.v v16, (a6)
@@ -1829,10 +1829,10 @@ define <vscale x 10 x i64> @vector_interleave_nxv10i64_nxv2i64(<vscale x 2 x i64
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 64
 ; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    vl1re64.v v15, (a5)
 ; RV32-NEXT:    vl1re64.v v12, (a6)
 ; RV32-NEXT:    vl1re64.v v13, (a1)
-; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, a0, a2
 ; RV32-NEXT:    vs2r.v v16, (a2)
 ; RV32-NEXT:    vs8r.v v8, (a0)
@@ -1857,26 +1857,26 @@ define <vscale x 10 x i64> @vector_interleave_nxv10i64_nxv2i64(<vscale x 2 x i64
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV64-NEXT:    vmv2r.v v20, v16
-; RV64-NEXT:    addi a0, sp, 64
 ; RV64-NEXT:    vmv2r.v v18, v12
+; RV64-NEXT:    vmv2r.v v16, v8
+; RV64-NEXT:    addi a0, sp, 64
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a2, a1, 2
 ; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 64
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    vmv2r.v v16, v8
+; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    add a4, a1, a2
 ; RV64-NEXT:    vmv2r.v v22, v16
 ; RV64-NEXT:    vmv2r.v v24, v18
 ; RV64-NEXT:    vmv1r.v v26, v20
-; RV64-NEXT:    add a3, a0, a2
-; RV64-NEXT:    vmv1r.v v23, v10
-; RV64-NEXT:    add a4, a1, a2
 ; RV64-NEXT:    add a5, a4, a2
-; RV64-NEXT:    vmv1r.v v25, v14
+; RV64-NEXT:    vmv1r.v v23, v10
 ; RV64-NEXT:    add a6, a5, a2
-; RV64-NEXT:    vmv1r.v v18, v11
+; RV64-NEXT:    vmv1r.v v25, v14
 ; RV64-NEXT:    vsseg5e64.v v22, (a0)
+; RV64-NEXT:    vmv1r.v v18, v11
 ; RV64-NEXT:    vmv1r.v v20, v15
 ; RV64-NEXT:    vsseg5e64.v v17, (a1)
 ; RV64-NEXT:    vl1re64.v v16, (a6)
@@ -1895,10 +1895,10 @@ define <vscale x 10 x i64> @vector_interleave_nxv10i64_nxv2i64(<vscale x 2 x i64
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 64
 ; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    vl1re64.v v15, (a5)
 ; RV64-NEXT:    vl1re64.v v12, (a6)
 ; RV64-NEXT:    vl1re64.v v13, (a1)
-; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, a0, a2
 ; RV64-NEXT:    vs2r.v v16, (a2)
 ; RV64-NEXT:    vs8r.v v8, (a0)
@@ -1923,26 +1923,26 @@ define <vscale x 10 x i64> @vector_interleave_nxv10i64_nxv2i64(<vscale x 2 x i64
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
 ; ZVBB-RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; ZVBB-RV32-NEXT:    vmv2r.v v20, v16
-; ZVBB-RV32-NEXT:    addi a0, sp, 64
 ; ZVBB-RV32-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV32-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
 ; ZVBB-RV32-NEXT:    csrr a1, vlenb
 ; ZVBB-RV32-NEXT:    slli a2, a1, 2
 ; ZVBB-RV32-NEXT:    add a1, a2, a1
 ; ZVBB-RV32-NEXT:    add a1, sp, a1
 ; ZVBB-RV32-NEXT:    addi a1, a1, 64
 ; ZVBB-RV32-NEXT:    csrr a2, vlenb
-; ZVBB-RV32-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV32-NEXT:    add a3, a0, a2
+; ZVBB-RV32-NEXT:    add a4, a1, a2
 ; ZVBB-RV32-NEXT:    vmv2r.v v22, v16
 ; ZVBB-RV32-NEXT:    vmv2r.v v24, v18
 ; ZVBB-RV32-NEXT:    vmv1r.v v26, v20
-; ZVBB-RV32-NEXT:    add a3, a0, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v23, v10
-; ZVBB-RV32-NEXT:    add a4, a1, a2
 ; ZVBB-RV32-NEXT:    add a5, a4, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV32-NEXT:    vmv1r.v v23, v10
 ; ZVBB-RV32-NEXT:    add a6, a5, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v18, v11
+; ZVBB-RV32-NEXT:    vmv1r.v v25, v14
 ; ZVBB-RV32-NEXT:    vsseg5e64.v v22, (a0)
+; ZVBB-RV32-NEXT:    vmv1r.v v18, v11
 ; ZVBB-RV32-NEXT:    vmv1r.v v20, v15
 ; ZVBB-RV32-NEXT:    vsseg5e64.v v17, (a1)
 ; ZVBB-RV32-NEXT:    vl1re64.v v16, (a6)
@@ -1961,10 +1961,10 @@ define <vscale x 10 x i64> @vector_interleave_nxv10i64_nxv2i64(<vscale x 2 x i64
 ; ZVBB-RV32-NEXT:    add a0, sp, a0
 ; ZVBB-RV32-NEXT:    addi a0, a0, 64
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    slli a2, a2, 3
 ; ZVBB-RV32-NEXT:    vl1re64.v v15, (a5)
 ; ZVBB-RV32-NEXT:    vl1re64.v v12, (a6)
 ; ZVBB-RV32-NEXT:    vl1re64.v v13, (a1)
-; ZVBB-RV32-NEXT:    slli a2, a2, 3
 ; ZVBB-RV32-NEXT:    add a2, a0, a2
 ; ZVBB-RV32-NEXT:    vs2r.v v16, (a2)
 ; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
@@ -1989,26 +1989,26 @@ define <vscale x 10 x i64> @vector_interleave_nxv10i64_nxv2i64(<vscale x 2 x i64
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
 ; ZVBB-RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; ZVBB-RV64-NEXT:    vmv2r.v v20, v16
-; ZVBB-RV64-NEXT:    addi a0, sp, 64
 ; ZVBB-RV64-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV64-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
 ; ZVBB-RV64-NEXT:    csrr a1, vlenb
 ; ZVBB-RV64-NEXT:    slli a2, a1, 2
 ; ZVBB-RV64-NEXT:    add a1, a2, a1
 ; ZVBB-RV64-NEXT:    add a1, sp, a1
 ; ZVBB-RV64-NEXT:    addi a1, a1, 64
 ; ZVBB-RV64-NEXT:    csrr a2, vlenb
-; ZVBB-RV64-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV64-NEXT:    add a3, a0, a2
+; ZVBB-RV64-NEXT:    add a4, a1, a2
 ; ZVBB-RV64-NEXT:    vmv2r.v v22, v16
 ; ZVBB-RV64-NEXT:    vmv2r.v v24, v18
 ; ZVBB-RV64-NEXT:    vmv1r.v v26, v20
-; ZVBB-RV64-NEXT:    add a3, a0, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v23, v10
-; ZVBB-RV64-NEXT:    add a4, a1, a2
 ; ZVBB-RV64-NEXT:    add a5, a4, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV64-NEXT:    vmv1r.v v23, v10
 ; ZVBB-RV64-NEXT:    add a6, a5, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v18, v11
+; ZVBB-RV64-NEXT:    vmv1r.v v25, v14
 ; ZVBB-RV64-NEXT:    vsseg5e64.v v22, (a0)
+; ZVBB-RV64-NEXT:    vmv1r.v v18, v11
 ; ZVBB-RV64-NEXT:    vmv1r.v v20, v15
 ; ZVBB-RV64-NEXT:    vsseg5e64.v v17, (a1)
 ; ZVBB-RV64-NEXT:    vl1re64.v v16, (a6)
@@ -2027,10 +2027,10 @@ define <vscale x 10 x i64> @vector_interleave_nxv10i64_nxv2i64(<vscale x 2 x i64
 ; ZVBB-RV64-NEXT:    add a0, sp, a0
 ; ZVBB-RV64-NEXT:    addi a0, a0, 64
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    slli a2, a2, 3
 ; ZVBB-RV64-NEXT:    vl1re64.v v15, (a5)
 ; ZVBB-RV64-NEXT:    vl1re64.v v12, (a6)
 ; ZVBB-RV64-NEXT:    vl1re64.v v13, (a1)
-; ZVBB-RV64-NEXT:    slli a2, a2, 3
 ; ZVBB-RV64-NEXT:    add a2, a0, a2
 ; ZVBB-RV64-NEXT:    vs2r.v v16, (a2)
 ; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
@@ -2075,6 +2075,7 @@ define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1
 ; CHECK-NEXT:    vmv1r.v v25, v22
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmerge.vim v8, v14, 1, v0
+; CHECK-NEXT:    add a7, a6, a2
 ; CHECK-NEXT:    vmv1r.v v26, v18
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    vmerge.vim v20, v14, 1, v0
@@ -2083,7 +2084,6 @@ define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1
 ; CHECK-NEXT:    vmerge.vim v10, v14, 1, v0
 ; CHECK-NEXT:    vmv1r.v v28, v20
 ; CHECK-NEXT:    vmv1r.v v18, v23
-; CHECK-NEXT:    add a7, a6, a2
 ; CHECK-NEXT:    vmv1r.v v29, v10
 ; CHECK-NEXT:    vmv1r.v v20, v9
 ; CHECK-NEXT:    vmv1r.v v0, v13
@@ -2169,6 +2169,7 @@ define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1
 ; ZVBB-NEXT:    vmv1r.v v25, v22
 ; ZVBB-NEXT:    vmv1r.v v0, v10
 ; ZVBB-NEXT:    vmerge.vim v8, v14, 1, v0
+; ZVBB-NEXT:    add a7, a6, a2
 ; ZVBB-NEXT:    vmv1r.v v26, v18
 ; ZVBB-NEXT:    vmv1r.v v0, v11
 ; ZVBB-NEXT:    vmerge.vim v20, v14, 1, v0
@@ -2177,7 +2178,6 @@ define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1
 ; ZVBB-NEXT:    vmerge.vim v10, v14, 1, v0
 ; ZVBB-NEXT:    vmv1r.v v28, v20
 ; ZVBB-NEXT:    vmv1r.v v18, v23
-; ZVBB-NEXT:    add a7, a6, a2
 ; ZVBB-NEXT:    vmv1r.v v29, v10
 ; ZVBB-NEXT:    vmv1r.v v20, v9
 ; ZVBB-NEXT:    vmv1r.v v0, v13
@@ -2252,35 +2252,35 @@ define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-NEXT:    vmv2r.v v26, v20
-; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    vmv2r.v v24, v16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 3
-; RV32-NEXT:    sub a1, a2, a1
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 64
 ; RV32-NEXT:    vmv2r.v v22, v12
-; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    vmv2r.v v20, v8
+; RV32-NEXT:    addi a1, sp, 64
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a2, a0, 3
+; RV32-NEXT:    sub a0, a2, a0
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    add a3, a1, a2
+; RV32-NEXT:    add a4, a0, a2
+; RV32-NEXT:    slli a5, a2, 2
+; RV32-NEXT:    slli a6, a2, 4
+; RV32-NEXT:    add a7, a4, a2
+; RV32-NEXT:    sub a5, a6, a5
 ; RV32-NEXT:    vmv1r.v v1, v20
 ; RV32-NEXT:    vmv1r.v v3, v22
 ; RV32-NEXT:    vmv1r.v v5, v24
 ; RV32-NEXT:    vmv1r.v v7, v26
-; RV32-NEXT:    add a3, a0, a2
+; RV32-NEXT:    add a6, a7, a2
 ; RV32-NEXT:    vmv1r.v v2, v10
-; RV32-NEXT:    add a4, a1, a2
-; RV32-NEXT:    slli a5, a2, 2
 ; RV32-NEXT:    vmv1r.v v4, v14
-; RV32-NEXT:    slli a6, a2, 4
-; RV32-NEXT:    add a7, a4, a2
 ; RV32-NEXT:    vmv1r.v v6, v18
-; RV32-NEXT:    sub a5, a6, a5
+; RV32-NEXT:    vsseg7e8.v v1, (a1)
 ; RV32-NEXT:    vmv1r.v v22, v11
-; RV32-NEXT:    add a6, a7, a2
 ; RV32-NEXT:    vmv1r.v v24, v15
-; RV32-NEXT:    vsseg7e8.v v1, (a0)
 ; RV32-NEXT:    vmv1r.v v26, v19
-; RV32-NEXT:    vsseg7e8.v v21, (a1)
+; RV32-NEXT:    vsseg7e8.v v21, (a0)
 ; RV32-NEXT:    vl1r.v v10, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1r.v v11, (a6)
@@ -2292,30 +2292,30 @@ define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8
 ; RV32-NEXT:    vl1r.v v18, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1r.v v19, (a6)
-; RV32-NEXT:    vl1r.v v16, (a0)
+; RV32-NEXT:    vl1r.v v16, (a1)
 ; RV32-NEXT:    vl1r.v v8, (a4)
 ; RV32-NEXT:    vl1r.v v17, (a3)
 ; RV32-NEXT:    vl1r.v v9, (a7)
-; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a3, 14
-; RV32-NEXT:    mul a0, a0, a3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 64
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1r.v v20, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1r.v v21, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    add a2, a1, a2
+; RV32-NEXT:    add a5, a1, a5
 ; RV32-NEXT:    vl1r.v v22, (a6)
-; RV32-NEXT:    vl1r.v v23, (a1)
-; RV32-NEXT:    add a5, a0, a5
+; RV32-NEXT:    vl1r.v v23, (a0)
 ; RV32-NEXT:    vs2r.v v12, (a5)
 ; RV32-NEXT:    vs4r.v v8, (a2)
-; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    vs8r.v v16, (a1)
 ; RV32-NEXT:    vl8r.v v16, (a2)
-; RV32-NEXT:    vl8r.v v8, (a0)
+; RV32-NEXT:    vl8r.v v8, (a1)
 ; RV32-NEXT:    addi sp, s0, -80
 ; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
@@ -2334,35 +2334,35 @@ define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-NEXT:    vmv2r.v v26, v20
-; RV64-NEXT:    addi a0, sp, 64
 ; RV64-NEXT:    vmv2r.v v24, v16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    sub a1, a2, a1
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 64
 ; RV64-NEXT:    vmv2r.v v22, v12
-; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    vmv2r.v v20, v8
+; RV64-NEXT:    addi a1, sp, 64
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a2, a0, 3
+; RV64-NEXT:    sub a0, a2, a0
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    add a3, a1, a2
+; RV64-NEXT:    add a4, a0, a2
+; RV64-NEXT:    slli a5, a2, 2
+; RV64-NEXT:    slli a6, a2, 4
+; RV64-NEXT:    add a7, a4, a2
+; RV64-NEXT:    sub a5, a6, a5
 ; RV64-NEXT:    vmv1r.v v1, v20
 ; RV64-NEXT:    vmv1r.v v3, v22
 ; RV64-NEXT:    vmv1r.v v5, v24
 ; RV64-NEXT:    vmv1r.v v7, v26
-; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    add a6, a7, a2
 ; RV64-NEXT:    vmv1r.v v2, v10
-; RV64-NEXT:    add a4, a1, a2
-; RV64-NEXT:    slli a5, a2, 2
 ; RV64-NEXT:    vmv1r.v v4, v14
-; RV64-NEXT:    slli a6, a2, 4
-; RV64-NEXT:    add a7, a4, a2
 ; RV64-NEXT:    vmv1r.v v6, v18
-; RV64-NEXT:    sub a5, a6, a5
+; RV64-NEXT:    vsseg7e8.v v1, (a1)
 ; RV64-NEXT:    vmv1r.v v22, v11
-; RV64-NEXT:    add a6, a7, a2
 ; RV64-NEXT:    vmv1r.v v24, v15
-; RV64-NEXT:    vsseg7e8.v v1, (a0)
 ; RV64-NEXT:    vmv1r.v v26, v19
-; RV64-NEXT:    vsseg7e8.v v21, (a1)
+; RV64-NEXT:    vsseg7e8.v v21, (a0)
 ; RV64-NEXT:    vl1r.v v10, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1r.v v11, (a6)
@@ -2374,30 +2374,30 @@ define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8
 ; RV64-NEXT:    vl1r.v v18, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1r.v v19, (a6)
-; RV64-NEXT:    vl1r.v v16, (a0)
+; RV64-NEXT:    vl1r.v v16, (a1)
 ; RV64-NEXT:    vl1r.v v8, (a4)
 ; RV64-NEXT:    vl1r.v v17, (a3)
 ; RV64-NEXT:    vl1r.v v9, (a7)
-; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a3, 14
-; RV64-NEXT:    mul a0, a0, a3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    mul a1, a1, a3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 64
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1r.v v20, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1r.v v21, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a2, a0, a2
+; RV64-NEXT:    add a2, a1, a2
+; RV64-NEXT:    add a5, a1, a5
 ; RV64-NEXT:    vl1r.v v22, (a6)
-; RV64-NEXT:    vl1r.v v23, (a1)
-; RV64-NEXT:    add a5, a0, a5
+; RV64-NEXT:    vl1r.v v23, (a0)
 ; RV64-NEXT:    vs2r.v v12, (a5)
 ; RV64-NEXT:    vs4r.v v8, (a2)
-; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    vs8r.v v16, (a1)
 ; RV64-NEXT:    vl8r.v v16, (a2)
-; RV64-NEXT:    vl8r.v v8, (a0)
+; RV64-NEXT:    vl8r.v v8, (a1)
 ; RV64-NEXT:    addi sp, s0, -80
 ; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -2416,35 +2416,35 @@ define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
 ; ZVBB-RV32-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; ZVBB-RV32-NEXT:    vmv2r.v v26, v20
-; ZVBB-RV32-NEXT:    addi a0, sp, 64
 ; ZVBB-RV32-NEXT:    vmv2r.v v24, v16
-; ZVBB-RV32-NEXT:    csrr a1, vlenb
-; ZVBB-RV32-NEXT:    slli a2, a1, 3
-; ZVBB-RV32-NEXT:    sub a1, a2, a1
-; ZVBB-RV32-NEXT:    add a1, sp, a1
-; ZVBB-RV32-NEXT:    addi a1, a1, 64
 ; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
-; ZVBB-RV32-NEXT:    csrr a2, vlenb
 ; ZVBB-RV32-NEXT:    vmv2r.v v20, v8
+; ZVBB-RV32-NEXT:    addi a1, sp, 64
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    slli a2, a0, 3
+; ZVBB-RV32-NEXT:    sub a0, a2, a0
+; ZVBB-RV32-NEXT:    add a0, sp, a0
+; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    csrr a2, vlenb
+; ZVBB-RV32-NEXT:    add a3, a1, a2
+; ZVBB-RV32-NEXT:    add a4, a0, a2
+; ZVBB-RV32-NEXT:    slli a5, a2, 2
+; ZVBB-RV32-NEXT:    slli a6, a2, 4
+; ZVBB-RV32-NEXT:    add a7, a4, a2
+; ZVBB-RV32-NEXT:    sub a5, a6, a5
 ; ZVBB-RV32-NEXT:    vmv1r.v v1, v20
 ; ZVBB-RV32-NEXT:    vmv1r.v v3, v22
 ; ZVBB-RV32-NEXT:    vmv1r.v v5, v24
 ; ZVBB-RV32-NEXT:    vmv1r.v v7, v26
-; ZVBB-RV32-NEXT:    add a3, a0, a2
+; ZVBB-RV32-NEXT:    add a6, a7, a2
 ; ZVBB-RV32-NEXT:    vmv1r.v v2, v10
-; ZVBB-RV32-NEXT:    add a4, a1, a2
-; ZVBB-RV32-NEXT:    slli a5, a2, 2
 ; ZVBB-RV32-NEXT:    vmv1r.v v4, v14
-; ZVBB-RV32-NEXT:    slli a6, a2, 4
-; ZVBB-RV32-NEXT:    add a7, a4, a2
 ; ZVBB-RV32-NEXT:    vmv1r.v v6, v18
-; ZVBB-RV32-NEXT:    sub a5, a6, a5
+; ZVBB-RV32-NEXT:    vsseg7e8.v v1, (a1)
 ; ZVBB-RV32-NEXT:    vmv1r.v v22, v11
-; ZVBB-RV32-NEXT:    add a6, a7, a2
 ; ZVBB-RV32-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV32-NEXT:    vsseg7e8.v v1, (a0)
 ; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV32-NEXT:    vsseg7e8.v v21, (a1)
+; ZVBB-RV32-NEXT:    vsseg7e8.v v21, (a0)
 ; ZVBB-RV32-NEXT:    vl1r.v v10, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1r.v v11, (a6)
@@ -2456,30 +2456,30 @@ define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8
 ; ZVBB-RV32-NEXT:    vl1r.v v18, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1r.v v19, (a6)
-; ZVBB-RV32-NEXT:    vl1r.v v16, (a0)
+; ZVBB-RV32-NEXT:    vl1r.v v16, (a1)
 ; ZVBB-RV32-NEXT:    vl1r.v v8, (a4)
 ; ZVBB-RV32-NEXT:    vl1r.v v17, (a3)
 ; ZVBB-RV32-NEXT:    vl1r.v v9, (a7)
-; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    csrr a1, vlenb
 ; ZVBB-RV32-NEXT:    li a3, 14
-; ZVBB-RV32-NEXT:    mul a0, a0, a3
-; ZVBB-RV32-NEXT:    add a0, sp, a0
-; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    mul a1, a1, a3
+; ZVBB-RV32-NEXT:    add a1, sp, a1
+; ZVBB-RV32-NEXT:    addi a1, a1, 64
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1r.v v20, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1r.v v21, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
-; ZVBB-RV32-NEXT:    add a2, a0, a2
+; ZVBB-RV32-NEXT:    add a2, a1, a2
+; ZVBB-RV32-NEXT:    add a5, a1, a5
 ; ZVBB-RV32-NEXT:    vl1r.v v22, (a6)
-; ZVBB-RV32-NEXT:    vl1r.v v23, (a1)
-; ZVBB-RV32-NEXT:    add a5, a0, a5
+; ZVBB-RV32-NEXT:    vl1r.v v23, (a0)
 ; ZVBB-RV32-NEXT:    vs2r.v v12, (a5)
 ; ZVBB-RV32-NEXT:    vs4r.v v8, (a2)
-; ZVBB-RV32-NEXT:    vs8r.v v16, (a0)
+; ZVBB-RV32-NEXT:    vs8r.v v16, (a1)
 ; ZVBB-RV32-NEXT:    vl8r.v v16, (a2)
-; ZVBB-RV32-NEXT:    vl8r.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8r.v v8, (a1)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
 ; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
@@ -2498,35 +2498,35 @@ define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
 ; ZVBB-RV64-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; ZVBB-RV64-NEXT:    vmv2r.v v26, v20
-; ZVBB-RV64-NEXT:    addi a0, sp, 64
 ; ZVBB-RV64-NEXT:    vmv2r.v v24, v16
-; ZVBB-RV64-NEXT:    csrr a1, vlenb
-; ZVBB-RV64-NEXT:    slli a2, a1, 3
-; ZVBB-RV64-NEXT:    sub a1, a2, a1
-; ZVBB-RV64-NEXT:    add a1, sp, a1
-; ZVBB-RV64-NEXT:    addi a1, a1, 64
 ; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
-; ZVBB-RV64-NEXT:    csrr a2, vlenb
 ; ZVBB-RV64-NEXT:    vmv2r.v v20, v8
+; ZVBB-RV64-NEXT:    addi a1, sp, 64
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    slli a2, a0, 3
+; ZVBB-RV64-NEXT:    sub a0, a2, a0
+; ZVBB-RV64-NEXT:    add a0, sp, a0
+; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    csrr a2, vlenb
+; ZVBB-RV64-NEXT:    add a3, a1, a2
+; ZVBB-RV64-NEXT:    add a4, a0, a2
+; ZVBB-RV64-NEXT:    slli a5, a2, 2
+; ZVBB-RV64-NEXT:    slli a6, a2, 4
+; ZVBB-RV64-NEXT:    add a7, a4, a2
+; ZVBB-RV64-NEXT:    sub a5, a6, a5
 ; ZVBB-RV64-NEXT:    vmv1r.v v1, v20
 ; ZVBB-RV64-NEXT:    vmv1r.v v3, v22
 ; ZVBB-RV64-NEXT:    vmv1r.v v5, v24
 ; ZVBB-RV64-NEXT:    vmv1r.v v7, v26
-; ZVBB-RV64-NEXT:    add a3, a0, a2
+; ZVBB-RV64-NEXT:    add a6, a7, a2
 ; ZVBB-RV64-NEXT:    vmv1r.v v2, v10
-; ZVBB-RV64-NEXT:    add a4, a1, a2
-; ZVBB-RV64-NEXT:    slli a5, a2, 2
 ; ZVBB-RV64-NEXT:    vmv1r.v v4, v14
-; ZVBB-RV64-NEXT:    slli a6, a2, 4
-; ZVBB-RV64-NEXT:    add a7, a4, a2
 ; ZVBB-RV64-NEXT:    vmv1r.v v6, v18
-; ZVBB-RV64-NEXT:    sub a5, a6, a5
+; ZVBB-RV64-NEXT:    vsseg7e8.v v1, (a1)
 ; ZVBB-RV64-NEXT:    vmv1r.v v22, v11
-; ZVBB-RV64-NEXT:    add a6, a7, a2
 ; ZVBB-RV64-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV64-NEXT:    vsseg7e8.v v1, (a0)
 ; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV64-NEXT:    vsseg7e8.v v21, (a1)
+; ZVBB-RV64-NEXT:    vsseg7e8.v v21, (a0)
 ; ZVBB-RV64-NEXT:    vl1r.v v10, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1r.v v11, (a6)
@@ -2538,30 +2538,30 @@ define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8
 ; ZVBB-RV64-NEXT:    vl1r.v v18, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1r.v v19, (a6)
-; ZVBB-RV64-NEXT:    vl1r.v v16, (a0)
+; ZVBB-RV64-NEXT:    vl1r.v v16, (a1)
 ; ZVBB-RV64-NEXT:    vl1r.v v8, (a4)
 ; ZVBB-RV64-NEXT:    vl1r.v v17, (a3)
 ; ZVBB-RV64-NEXT:    vl1r.v v9, (a7)
-; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    csrr a1, vlenb
 ; ZVBB-RV64-NEXT:    li a3, 14
-; ZVBB-RV64-NEXT:    mul a0, a0, a3
-; ZVBB-RV64-NEXT:    add a0, sp, a0
-; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    mul a1, a1, a3
+; ZVBB-RV64-NEXT:    add a1, sp, a1
+; ZVBB-RV64-NEXT:    addi a1, a1, 64
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1r.v v20, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1r.v v21, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
-; ZVBB-RV64-NEXT:    add a2, a0, a2
+; ZVBB-RV64-NEXT:    add a2, a1, a2
+; ZVBB-RV64-NEXT:    add a5, a1, a5
 ; ZVBB-RV64-NEXT:    vl1r.v v22, (a6)
-; ZVBB-RV64-NEXT:    vl1r.v v23, (a1)
-; ZVBB-RV64-NEXT:    add a5, a0, a5
+; ZVBB-RV64-NEXT:    vl1r.v v23, (a0)
 ; ZVBB-RV64-NEXT:    vs2r.v v12, (a5)
 ; ZVBB-RV64-NEXT:    vs4r.v v8, (a2)
-; ZVBB-RV64-NEXT:    vs8r.v v16, (a0)
+; ZVBB-RV64-NEXT:    vs8r.v v16, (a1)
 ; ZVBB-RV64-NEXT:    vl8r.v v16, (a2)
-; ZVBB-RV64-NEXT:    vl8r.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8r.v v8, (a1)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
 ; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -2586,35 +2586,35 @@ define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; RV32-NEXT:    vmv2r.v v26, v20
-; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    vmv2r.v v24, v16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 3
-; RV32-NEXT:    sub a1, a2, a1
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 64
 ; RV32-NEXT:    vmv2r.v v22, v12
-; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    vmv2r.v v20, v8
+; RV32-NEXT:    addi a1, sp, 64
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a2, a0, 3
+; RV32-NEXT:    sub a0, a2, a0
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    add a3, a1, a2
+; RV32-NEXT:    add a4, a0, a2
+; RV32-NEXT:    slli a5, a2, 2
+; RV32-NEXT:    slli a6, a2, 4
+; RV32-NEXT:    add a7, a4, a2
+; RV32-NEXT:    sub a5, a6, a5
 ; RV32-NEXT:    vmv1r.v v1, v20
 ; RV32-NEXT:    vmv1r.v v3, v22
 ; RV32-NEXT:    vmv1r.v v5, v24
 ; RV32-NEXT:    vmv1r.v v7, v26
-; RV32-NEXT:    add a3, a0, a2
+; RV32-NEXT:    add a6, a7, a2
 ; RV32-NEXT:    vmv1r.v v2, v10
-; RV32-NEXT:    add a4, a1, a2
-; RV32-NEXT:    slli a5, a2, 2
 ; RV32-NEXT:    vmv1r.v v4, v14
-; RV32-NEXT:    slli a6, a2, 4
-; RV32-NEXT:    add a7, a4, a2
 ; RV32-NEXT:    vmv1r.v v6, v18
-; RV32-NEXT:    sub a5, a6, a5
+; RV32-NEXT:    vsseg7e16.v v1, (a1)
 ; RV32-NEXT:    vmv1r.v v22, v11
-; RV32-NEXT:    add a6, a7, a2
 ; RV32-NEXT:    vmv1r.v v24, v15
-; RV32-NEXT:    vsseg7e16.v v1, (a0)
 ; RV32-NEXT:    vmv1r.v v26, v19
-; RV32-NEXT:    vsseg7e16.v v21, (a1)
+; RV32-NEXT:    vsseg7e16.v v21, (a0)
 ; RV32-NEXT:    vl1re16.v v10, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1re16.v v11, (a6)
@@ -2626,30 +2626,30 @@ define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16
 ; RV32-NEXT:    vl1re16.v v18, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1re16.v v19, (a6)
-; RV32-NEXT:    vl1re16.v v16, (a0)
+; RV32-NEXT:    vl1re16.v v16, (a1)
 ; RV32-NEXT:    vl1re16.v v8, (a4)
 ; RV32-NEXT:    vl1re16.v v17, (a3)
 ; RV32-NEXT:    vl1re16.v v9, (a7)
-; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a3, 14
-; RV32-NEXT:    mul a0, a0, a3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 64
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1re16.v v20, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1re16.v v21, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    add a2, a1, a2
+; RV32-NEXT:    add a5, a1, a5
 ; RV32-NEXT:    vl1re16.v v22, (a6)
-; RV32-NEXT:    vl1re16.v v23, (a1)
-; RV32-NEXT:    add a5, a0, a5
+; RV32-NEXT:    vl1re16.v v23, (a0)
 ; RV32-NEXT:    vs2r.v v12, (a5)
 ; RV32-NEXT:    vs4r.v v8, (a2)
-; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    vs8r.v v16, (a1)
 ; RV32-NEXT:    vl8re16.v v16, (a2)
-; RV32-NEXT:    vl8re16.v v8, (a0)
+; RV32-NEXT:    vl8re16.v v8, (a1)
 ; RV32-NEXT:    addi sp, s0, -80
 ; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
@@ -2668,35 +2668,35 @@ define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; RV64-NEXT:    vmv2r.v v26, v20
-; RV64-NEXT:    addi a0, sp, 64
 ; RV64-NEXT:    vmv2r.v v24, v16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    sub a1, a2, a1
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 64
 ; RV64-NEXT:    vmv2r.v v22, v12
-; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    vmv2r.v v20, v8
+; RV64-NEXT:    addi a1, sp, 64
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a2, a0, 3
+; RV64-NEXT:    sub a0, a2, a0
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    add a3, a1, a2
+; RV64-NEXT:    add a4, a0, a2
+; RV64-NEXT:    slli a5, a2, 2
+; RV64-NEXT:    slli a6, a2, 4
+; RV64-NEXT:    add a7, a4, a2
+; RV64-NEXT:    sub a5, a6, a5
 ; RV64-NEXT:    vmv1r.v v1, v20
 ; RV64-NEXT:    vmv1r.v v3, v22
 ; RV64-NEXT:    vmv1r.v v5, v24
 ; RV64-NEXT:    vmv1r.v v7, v26
-; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    add a6, a7, a2
 ; RV64-NEXT:    vmv1r.v v2, v10
-; RV64-NEXT:    add a4, a1, a2
-; RV64-NEXT:    slli a5, a2, 2
 ; RV64-NEXT:    vmv1r.v v4, v14
-; RV64-NEXT:    slli a6, a2, 4
-; RV64-NEXT:    add a7, a4, a2
 ; RV64-NEXT:    vmv1r.v v6, v18
-; RV64-NEXT:    sub a5, a6, a5
+; RV64-NEXT:    vsseg7e16.v v1, (a1)
 ; RV64-NEXT:    vmv1r.v v22, v11
-; RV64-NEXT:    add a6, a7, a2
 ; RV64-NEXT:    vmv1r.v v24, v15
-; RV64-NEXT:    vsseg7e16.v v1, (a0)
 ; RV64-NEXT:    vmv1r.v v26, v19
-; RV64-NEXT:    vsseg7e16.v v21, (a1)
+; RV64-NEXT:    vsseg7e16.v v21, (a0)
 ; RV64-NEXT:    vl1re16.v v10, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1re16.v v11, (a6)
@@ -2708,30 +2708,30 @@ define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16
 ; RV64-NEXT:    vl1re16.v v18, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1re16.v v19, (a6)
-; RV64-NEXT:    vl1re16.v v16, (a0)
+; RV64-NEXT:    vl1re16.v v16, (a1)
 ; RV64-NEXT:    vl1re16.v v8, (a4)
 ; RV64-NEXT:    vl1re16.v v17, (a3)
 ; RV64-NEXT:    vl1re16.v v9, (a7)
-; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a3, 14
-; RV64-NEXT:    mul a0, a0, a3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    mul a1, a1, a3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 64
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1re16.v v20, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1re16.v v21, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a2, a0, a2
+; RV64-NEXT:    add a2, a1, a2
+; RV64-NEXT:    add a5, a1, a5
 ; RV64-NEXT:    vl1re16.v v22, (a6)
-; RV64-NEXT:    vl1re16.v v23, (a1)
-; RV64-NEXT:    add a5, a0, a5
+; RV64-NEXT:    vl1re16.v v23, (a0)
 ; RV64-NEXT:    vs2r.v v12, (a5)
 ; RV64-NEXT:    vs4r.v v8, (a2)
-; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    vs8r.v v16, (a1)
 ; RV64-NEXT:    vl8re16.v v16, (a2)
-; RV64-NEXT:    vl8re16.v v8, (a0)
+; RV64-NEXT:    vl8re16.v v8, (a1)
 ; RV64-NEXT:    addi sp, s0, -80
 ; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -2750,35 +2750,35 @@ define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
 ; ZVBB-RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVBB-RV32-NEXT:    vmv2r.v v26, v20
-; ZVBB-RV32-NEXT:    addi a0, sp, 64
 ; ZVBB-RV32-NEXT:    vmv2r.v v24, v16
-; ZVBB-RV32-NEXT:    csrr a1, vlenb
-; ZVBB-RV32-NEXT:    slli a2, a1, 3
-; ZVBB-RV32-NEXT:    sub a1, a2, a1
-; ZVBB-RV32-NEXT:    add a1, sp, a1
-; ZVBB-RV32-NEXT:    addi a1, a1, 64
 ; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
-; ZVBB-RV32-NEXT:    csrr a2, vlenb
 ; ZVBB-RV32-NEXT:    vmv2r.v v20, v8
+; ZVBB-RV32-NEXT:    addi a1, sp, 64
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    slli a2, a0, 3
+; ZVBB-RV32-NEXT:    sub a0, a2, a0
+; ZVBB-RV32-NEXT:    add a0, sp, a0
+; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    csrr a2, vlenb
+; ZVBB-RV32-NEXT:    add a3, a1, a2
+; ZVBB-RV32-NEXT:    add a4, a0, a2
+; ZVBB-RV32-NEXT:    slli a5, a2, 2
+; ZVBB-RV32-NEXT:    slli a6, a2, 4
+; ZVBB-RV32-NEXT:    add a7, a4, a2
+; ZVBB-RV32-NEXT:    sub a5, a6, a5
 ; ZVBB-RV32-NEXT:    vmv1r.v v1, v20
 ; ZVBB-RV32-NEXT:    vmv1r.v v3, v22
 ; ZVBB-RV32-NEXT:    vmv1r.v v5, v24
 ; ZVBB-RV32-NEXT:    vmv1r.v v7, v26
-; ZVBB-RV32-NEXT:    add a3, a0, a2
+; ZVBB-RV32-NEXT:    add a6, a7, a2
 ; ZVBB-RV32-NEXT:    vmv1r.v v2, v10
-; ZVBB-RV32-NEXT:    add a4, a1, a2
-; ZVBB-RV32-NEXT:    slli a5, a2, 2
 ; ZVBB-RV32-NEXT:    vmv1r.v v4, v14
-; ZVBB-RV32-NEXT:    slli a6, a2, 4
-; ZVBB-RV32-NEXT:    add a7, a4, a2
 ; ZVBB-RV32-NEXT:    vmv1r.v v6, v18
-; ZVBB-RV32-NEXT:    sub a5, a6, a5
+; ZVBB-RV32-NEXT:    vsseg7e16.v v1, (a1)
 ; ZVBB-RV32-NEXT:    vmv1r.v v22, v11
-; ZVBB-RV32-NEXT:    add a6, a7, a2
 ; ZVBB-RV32-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV32-NEXT:    vsseg7e16.v v1, (a0)
 ; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV32-NEXT:    vsseg7e16.v v21, (a1)
+; ZVBB-RV32-NEXT:    vsseg7e16.v v21, (a0)
 ; ZVBB-RV32-NEXT:    vl1re16.v v10, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1re16.v v11, (a6)
@@ -2790,30 +2790,30 @@ define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16
 ; ZVBB-RV32-NEXT:    vl1re16.v v18, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1re16.v v19, (a6)
-; ZVBB-RV32-NEXT:    vl1re16.v v16, (a0)
+; ZVBB-RV32-NEXT:    vl1re16.v v16, (a1)
 ; ZVBB-RV32-NEXT:    vl1re16.v v8, (a4)
 ; ZVBB-RV32-NEXT:    vl1re16.v v17, (a3)
 ; ZVBB-RV32-NEXT:    vl1re16.v v9, (a7)
-; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    csrr a1, vlenb
 ; ZVBB-RV32-NEXT:    li a3, 14
-; ZVBB-RV32-NEXT:    mul a0, a0, a3
-; ZVBB-RV32-NEXT:    add a0, sp, a0
-; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    mul a1, a1, a3
+; ZVBB-RV32-NEXT:    add a1, sp, a1
+; ZVBB-RV32-NEXT:    addi a1, a1, 64
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1re16.v v20, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1re16.v v21, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
-; ZVBB-RV32-NEXT:    add a2, a0, a2
+; ZVBB-RV32-NEXT:    add a2, a1, a2
+; ZVBB-RV32-NEXT:    add a5, a1, a5
 ; ZVBB-RV32-NEXT:    vl1re16.v v22, (a6)
-; ZVBB-RV32-NEXT:    vl1re16.v v23, (a1)
-; ZVBB-RV32-NEXT:    add a5, a0, a5
+; ZVBB-RV32-NEXT:    vl1re16.v v23, (a0)
 ; ZVBB-RV32-NEXT:    vs2r.v v12, (a5)
 ; ZVBB-RV32-NEXT:    vs4r.v v8, (a2)
-; ZVBB-RV32-NEXT:    vs8r.v v16, (a0)
+; ZVBB-RV32-NEXT:    vs8r.v v16, (a1)
 ; ZVBB-RV32-NEXT:    vl8re16.v v16, (a2)
-; ZVBB-RV32-NEXT:    vl8re16.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8re16.v v8, (a1)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
 ; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
@@ -2832,35 +2832,35 @@ define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
 ; ZVBB-RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVBB-RV64-NEXT:    vmv2r.v v26, v20
-; ZVBB-RV64-NEXT:    addi a0, sp, 64
 ; ZVBB-RV64-NEXT:    vmv2r.v v24, v16
-; ZVBB-RV64-NEXT:    csrr a1, vlenb
-; ZVBB-RV64-NEXT:    slli a2, a1, 3
-; ZVBB-RV64-NEXT:    sub a1, a2, a1
-; ZVBB-RV64-NEXT:    add a1, sp, a1
-; ZVBB-RV64-NEXT:    addi a1, a1, 64
 ; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
-; ZVBB-RV64-NEXT:    csrr a2, vlenb
 ; ZVBB-RV64-NEXT:    vmv2r.v v20, v8
+; ZVBB-RV64-NEXT:    addi a1, sp, 64
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    slli a2, a0, 3
+; ZVBB-RV64-NEXT:    sub a0, a2, a0
+; ZVBB-RV64-NEXT:    add a0, sp, a0
+; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    csrr a2, vlenb
+; ZVBB-RV64-NEXT:    add a3, a1, a2
+; ZVBB-RV64-NEXT:    add a4, a0, a2
+; ZVBB-RV64-NEXT:    slli a5, a2, 2
+; ZVBB-RV64-NEXT:    slli a6, a2, 4
+; ZVBB-RV64-NEXT:    add a7, a4, a2
+; ZVBB-RV64-NEXT:    sub a5, a6, a5
 ; ZVBB-RV64-NEXT:    vmv1r.v v1, v20
 ; ZVBB-RV64-NEXT:    vmv1r.v v3, v22
 ; ZVBB-RV64-NEXT:    vmv1r.v v5, v24
 ; ZVBB-RV64-NEXT:    vmv1r.v v7, v26
-; ZVBB-RV64-NEXT:    add a3, a0, a2
+; ZVBB-RV64-NEXT:    add a6, a7, a2
 ; ZVBB-RV64-NEXT:    vmv1r.v v2, v10
-; ZVBB-RV64-NEXT:    add a4, a1, a2
-; ZVBB-RV64-NEXT:    slli a5, a2, 2
 ; ZVBB-RV64-NEXT:    vmv1r.v v4, v14
-; ZVBB-RV64-NEXT:    slli a6, a2, 4
-; ZVBB-RV64-NEXT:    add a7, a4, a2
 ; ZVBB-RV64-NEXT:    vmv1r.v v6, v18
-; ZVBB-RV64-NEXT:    sub a5, a6, a5
+; ZVBB-RV64-NEXT:    vsseg7e16.v v1, (a1)
 ; ZVBB-RV64-NEXT:    vmv1r.v v22, v11
-; ZVBB-RV64-NEXT:    add a6, a7, a2
 ; ZVBB-RV64-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV64-NEXT:    vsseg7e16.v v1, (a0)
 ; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV64-NEXT:    vsseg7e16.v v21, (a1)
+; ZVBB-RV64-NEXT:    vsseg7e16.v v21, (a0)
 ; ZVBB-RV64-NEXT:    vl1re16.v v10, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1re16.v v11, (a6)
@@ -2872,30 +2872,30 @@ define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16
 ; ZVBB-RV64-NEXT:    vl1re16.v v18, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1re16.v v19, (a6)
-; ZVBB-RV64-NEXT:    vl1re16.v v16, (a0)
+; ZVBB-RV64-NEXT:    vl1re16.v v16, (a1)
 ; ZVBB-RV64-NEXT:    vl1re16.v v8, (a4)
 ; ZVBB-RV64-NEXT:    vl1re16.v v17, (a3)
 ; ZVBB-RV64-NEXT:    vl1re16.v v9, (a7)
-; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    csrr a1, vlenb
 ; ZVBB-RV64-NEXT:    li a3, 14
-; ZVBB-RV64-NEXT:    mul a0, a0, a3
-; ZVBB-RV64-NEXT:    add a0, sp, a0
-; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    mul a1, a1, a3
+; ZVBB-RV64-NEXT:    add a1, sp, a1
+; ZVBB-RV64-NEXT:    addi a1, a1, 64
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1re16.v v20, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1re16.v v21, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
-; ZVBB-RV64-NEXT:    add a2, a0, a2
+; ZVBB-RV64-NEXT:    add a2, a1, a2
+; ZVBB-RV64-NEXT:    add a5, a1, a5
 ; ZVBB-RV64-NEXT:    vl1re16.v v22, (a6)
-; ZVBB-RV64-NEXT:    vl1re16.v v23, (a1)
-; ZVBB-RV64-NEXT:    add a5, a0, a5
+; ZVBB-RV64-NEXT:    vl1re16.v v23, (a0)
 ; ZVBB-RV64-NEXT:    vs2r.v v12, (a5)
 ; ZVBB-RV64-NEXT:    vs4r.v v8, (a2)
-; ZVBB-RV64-NEXT:    vs8r.v v16, (a0)
+; ZVBB-RV64-NEXT:    vs8r.v v16, (a1)
 ; ZVBB-RV64-NEXT:    vl8re16.v v16, (a2)
-; ZVBB-RV64-NEXT:    vl8re16.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8re16.v v8, (a1)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
 ; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -2920,35 +2920,35 @@ define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv2r.v v26, v20
-; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    vmv2r.v v24, v16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 3
-; RV32-NEXT:    sub a1, a2, a1
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 64
 ; RV32-NEXT:    vmv2r.v v22, v12
-; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    vmv2r.v v20, v8
+; RV32-NEXT:    addi a1, sp, 64
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a2, a0, 3
+; RV32-NEXT:    sub a0, a2, a0
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    add a3, a1, a2
+; RV32-NEXT:    add a4, a0, a2
+; RV32-NEXT:    slli a5, a2, 2
+; RV32-NEXT:    slli a6, a2, 4
+; RV32-NEXT:    add a7, a4, a2
+; RV32-NEXT:    sub a5, a6, a5
 ; RV32-NEXT:    vmv1r.v v1, v20
 ; RV32-NEXT:    vmv1r.v v3, v22
 ; RV32-NEXT:    vmv1r.v v5, v24
 ; RV32-NEXT:    vmv1r.v v7, v26
-; RV32-NEXT:    add a3, a0, a2
+; RV32-NEXT:    add a6, a7, a2
 ; RV32-NEXT:    vmv1r.v v2, v10
-; RV32-NEXT:    add a4, a1, a2
-; RV32-NEXT:    slli a5, a2, 2
 ; RV32-NEXT:    vmv1r.v v4, v14
-; RV32-NEXT:    slli a6, a2, 4
-; RV32-NEXT:    add a7, a4, a2
 ; RV32-NEXT:    vmv1r.v v6, v18
-; RV32-NEXT:    sub a5, a6, a5
+; RV32-NEXT:    vsseg7e32.v v1, (a1)
 ; RV32-NEXT:    vmv1r.v v22, v11
-; RV32-NEXT:    add a6, a7, a2
 ; RV32-NEXT:    vmv1r.v v24, v15
-; RV32-NEXT:    vsseg7e32.v v1, (a0)
 ; RV32-NEXT:    vmv1r.v v26, v19
-; RV32-NEXT:    vsseg7e32.v v21, (a1)
+; RV32-NEXT:    vsseg7e32.v v21, (a0)
 ; RV32-NEXT:    vl1re32.v v10, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1re32.v v11, (a6)
@@ -2960,30 +2960,30 @@ define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32
 ; RV32-NEXT:    vl1re32.v v18, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1re32.v v19, (a6)
-; RV32-NEXT:    vl1re32.v v16, (a0)
+; RV32-NEXT:    vl1re32.v v16, (a1)
 ; RV32-NEXT:    vl1re32.v v8, (a4)
 ; RV32-NEXT:    vl1re32.v v17, (a3)
 ; RV32-NEXT:    vl1re32.v v9, (a7)
-; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a3, 14
-; RV32-NEXT:    mul a0, a0, a3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 64
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1re32.v v20, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1re32.v v21, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    add a2, a1, a2
+; RV32-NEXT:    add a5, a1, a5
 ; RV32-NEXT:    vl1re32.v v22, (a6)
-; RV32-NEXT:    vl1re32.v v23, (a1)
-; RV32-NEXT:    add a5, a0, a5
+; RV32-NEXT:    vl1re32.v v23, (a0)
 ; RV32-NEXT:    vs2r.v v12, (a5)
 ; RV32-NEXT:    vs4r.v v8, (a2)
-; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    vs8r.v v16, (a1)
 ; RV32-NEXT:    vl8re32.v v16, (a2)
-; RV32-NEXT:    vl8re32.v v8, (a0)
+; RV32-NEXT:    vl8re32.v v8, (a1)
 ; RV32-NEXT:    addi sp, s0, -80
 ; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
@@ -3002,35 +3002,35 @@ define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; RV64-NEXT:    vmv2r.v v26, v20
-; RV64-NEXT:    addi a0, sp, 64
 ; RV64-NEXT:    vmv2r.v v24, v16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    sub a1, a2, a1
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 64
 ; RV64-NEXT:    vmv2r.v v22, v12
-; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    vmv2r.v v20, v8
+; RV64-NEXT:    addi a1, sp, 64
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a2, a0, 3
+; RV64-NEXT:    sub a0, a2, a0
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    add a3, a1, a2
+; RV64-NEXT:    add a4, a0, a2
+; RV64-NEXT:    slli a5, a2, 2
+; RV64-NEXT:    slli a6, a2, 4
+; RV64-NEXT:    add a7, a4, a2
+; RV64-NEXT:    sub a5, a6, a5
 ; RV64-NEXT:    vmv1r.v v1, v20
 ; RV64-NEXT:    vmv1r.v v3, v22
 ; RV64-NEXT:    vmv1r.v v5, v24
 ; RV64-NEXT:    vmv1r.v v7, v26
-; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    add a6, a7, a2
 ; RV64-NEXT:    vmv1r.v v2, v10
-; RV64-NEXT:    add a4, a1, a2
-; RV64-NEXT:    slli a5, a2, 2
 ; RV64-NEXT:    vmv1r.v v4, v14
-; RV64-NEXT:    slli a6, a2, 4
-; RV64-NEXT:    add a7, a4, a2
 ; RV64-NEXT:    vmv1r.v v6, v18
-; RV64-NEXT:    sub a5, a6, a5
+; RV64-NEXT:    vsseg7e32.v v1, (a1)
 ; RV64-NEXT:    vmv1r.v v22, v11
-; RV64-NEXT:    add a6, a7, a2
 ; RV64-NEXT:    vmv1r.v v24, v15
-; RV64-NEXT:    vsseg7e32.v v1, (a0)
 ; RV64-NEXT:    vmv1r.v v26, v19
-; RV64-NEXT:    vsseg7e32.v v21, (a1)
+; RV64-NEXT:    vsseg7e32.v v21, (a0)
 ; RV64-NEXT:    vl1re32.v v10, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1re32.v v11, (a6)
@@ -3042,30 +3042,30 @@ define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32
 ; RV64-NEXT:    vl1re32.v v18, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1re32.v v19, (a6)
-; RV64-NEXT:    vl1re32.v v16, (a0)
+; RV64-NEXT:    vl1re32.v v16, (a1)
 ; RV64-NEXT:    vl1re32.v v8, (a4)
 ; RV64-NEXT:    vl1re32.v v17, (a3)
 ; RV64-NEXT:    vl1re32.v v9, (a7)
-; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a3, 14
-; RV64-NEXT:    mul a0, a0, a3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    mul a1, a1, a3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 64
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1re32.v v20, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1re32.v v21, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a2, a0, a2
+; RV64-NEXT:    add a2, a1, a2
+; RV64-NEXT:    add a5, a1, a5
 ; RV64-NEXT:    vl1re32.v v22, (a6)
-; RV64-NEXT:    vl1re32.v v23, (a1)
-; RV64-NEXT:    add a5, a0, a5
+; RV64-NEXT:    vl1re32.v v23, (a0)
 ; RV64-NEXT:    vs2r.v v12, (a5)
 ; RV64-NEXT:    vs4r.v v8, (a2)
-; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    vs8r.v v16, (a1)
 ; RV64-NEXT:    vl8re32.v v16, (a2)
-; RV64-NEXT:    vl8re32.v v8, (a0)
+; RV64-NEXT:    vl8re32.v v8, (a1)
 ; RV64-NEXT:    addi sp, s0, -80
 ; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -3084,35 +3084,35 @@ define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
 ; ZVBB-RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; ZVBB-RV32-NEXT:    vmv2r.v v26, v20
-; ZVBB-RV32-NEXT:    addi a0, sp, 64
 ; ZVBB-RV32-NEXT:    vmv2r.v v24, v16
-; ZVBB-RV32-NEXT:    csrr a1, vlenb
-; ZVBB-RV32-NEXT:    slli a2, a1, 3
-; ZVBB-RV32-NEXT:    sub a1, a2, a1
-; ZVBB-RV32-NEXT:    add a1, sp, a1
-; ZVBB-RV32-NEXT:    addi a1, a1, 64
 ; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
-; ZVBB-RV32-NEXT:    csrr a2, vlenb
 ; ZVBB-RV32-NEXT:    vmv2r.v v20, v8
+; ZVBB-RV32-NEXT:    addi a1, sp, 64
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    slli a2, a0, 3
+; ZVBB-RV32-NEXT:    sub a0, a2, a0
+; ZVBB-RV32-NEXT:    add a0, sp, a0
+; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    csrr a2, vlenb
+; ZVBB-RV32-NEXT:    add a3, a1, a2
+; ZVBB-RV32-NEXT:    add a4, a0, a2
+; ZVBB-RV32-NEXT:    slli a5, a2, 2
+; ZVBB-RV32-NEXT:    slli a6, a2, 4
+; ZVBB-RV32-NEXT:    add a7, a4, a2
+; ZVBB-RV32-NEXT:    sub a5, a6, a5
 ; ZVBB-RV32-NEXT:    vmv1r.v v1, v20
 ; ZVBB-RV32-NEXT:    vmv1r.v v3, v22
 ; ZVBB-RV32-NEXT:    vmv1r.v v5, v24
 ; ZVBB-RV32-NEXT:    vmv1r.v v7, v26
-; ZVBB-RV32-NEXT:    add a3, a0, a2
+; ZVBB-RV32-NEXT:    add a6, a7, a2
 ; ZVBB-RV32-NEXT:    vmv1r.v v2, v10
-; ZVBB-RV32-NEXT:    add a4, a1, a2
-; ZVBB-RV32-NEXT:    slli a5, a2, 2
 ; ZVBB-RV32-NEXT:    vmv1r.v v4, v14
-; ZVBB-RV32-NEXT:    slli a6, a2, 4
-; ZVBB-RV32-NEXT:    add a7, a4, a2
 ; ZVBB-RV32-NEXT:    vmv1r.v v6, v18
-; ZVBB-RV32-NEXT:    sub a5, a6, a5
+; ZVBB-RV32-NEXT:    vsseg7e32.v v1, (a1)
 ; ZVBB-RV32-NEXT:    vmv1r.v v22, v11
-; ZVBB-RV32-NEXT:    add a6, a7, a2
 ; ZVBB-RV32-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV32-NEXT:    vsseg7e32.v v1, (a0)
 ; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV32-NEXT:    vsseg7e32.v v21, (a1)
+; ZVBB-RV32-NEXT:    vsseg7e32.v v21, (a0)
 ; ZVBB-RV32-NEXT:    vl1re32.v v10, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1re32.v v11, (a6)
@@ -3124,30 +3124,30 @@ define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32
 ; ZVBB-RV32-NEXT:    vl1re32.v v18, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1re32.v v19, (a6)
-; ZVBB-RV32-NEXT:    vl1re32.v v16, (a0)
+; ZVBB-RV32-NEXT:    vl1re32.v v16, (a1)
 ; ZVBB-RV32-NEXT:    vl1re32.v v8, (a4)
 ; ZVBB-RV32-NEXT:    vl1re32.v v17, (a3)
 ; ZVBB-RV32-NEXT:    vl1re32.v v9, (a7)
-; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    csrr a1, vlenb
 ; ZVBB-RV32-NEXT:    li a3, 14
-; ZVBB-RV32-NEXT:    mul a0, a0, a3
-; ZVBB-RV32-NEXT:    add a0, sp, a0
-; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    mul a1, a1, a3
+; ZVBB-RV32-NEXT:    add a1, sp, a1
+; ZVBB-RV32-NEXT:    addi a1, a1, 64
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1re32.v v20, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1re32.v v21, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
-; ZVBB-RV32-NEXT:    add a2, a0, a2
+; ZVBB-RV32-NEXT:    add a2, a1, a2
+; ZVBB-RV32-NEXT:    add a5, a1, a5
 ; ZVBB-RV32-NEXT:    vl1re32.v v22, (a6)
-; ZVBB-RV32-NEXT:    vl1re32.v v23, (a1)
-; ZVBB-RV32-NEXT:    add a5, a0, a5
+; ZVBB-RV32-NEXT:    vl1re32.v v23, (a0)
 ; ZVBB-RV32-NEXT:    vs2r.v v12, (a5)
 ; ZVBB-RV32-NEXT:    vs4r.v v8, (a2)
-; ZVBB-RV32-NEXT:    vs8r.v v16, (a0)
+; ZVBB-RV32-NEXT:    vs8r.v v16, (a1)
 ; ZVBB-RV32-NEXT:    vl8re32.v v16, (a2)
-; ZVBB-RV32-NEXT:    vl8re32.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8re32.v v8, (a1)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
 ; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
@@ -3166,35 +3166,35 @@ define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
 ; ZVBB-RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; ZVBB-RV64-NEXT:    vmv2r.v v26, v20
-; ZVBB-RV64-NEXT:    addi a0, sp, 64
 ; ZVBB-RV64-NEXT:    vmv2r.v v24, v16
-; ZVBB-RV64-NEXT:    csrr a1, vlenb
-; ZVBB-RV64-NEXT:    slli a2, a1, 3
-; ZVBB-RV64-NEXT:    sub a1, a2, a1
-; ZVBB-RV64-NEXT:    add a1, sp, a1
-; ZVBB-RV64-NEXT:    addi a1, a1, 64
 ; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
-; ZVBB-RV64-NEXT:    csrr a2, vlenb
 ; ZVBB-RV64-NEXT:    vmv2r.v v20, v8
+; ZVBB-RV64-NEXT:    addi a1, sp, 64
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    slli a2, a0, 3
+; ZVBB-RV64-NEXT:    sub a0, a2, a0
+; ZVBB-RV64-NEXT:    add a0, sp, a0
+; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    csrr a2, vlenb
+; ZVBB-RV64-NEXT:    add a3, a1, a2
+; ZVBB-RV64-NEXT:    add a4, a0, a2
+; ZVBB-RV64-NEXT:    slli a5, a2, 2
+; ZVBB-RV64-NEXT:    slli a6, a2, 4
+; ZVBB-RV64-NEXT:    add a7, a4, a2
+; ZVBB-RV64-NEXT:    sub a5, a6, a5
 ; ZVBB-RV64-NEXT:    vmv1r.v v1, v20
 ; ZVBB-RV64-NEXT:    vmv1r.v v3, v22
 ; ZVBB-RV64-NEXT:    vmv1r.v v5, v24
 ; ZVBB-RV64-NEXT:    vmv1r.v v7, v26
-; ZVBB-RV64-NEXT:    add a3, a0, a2
+; ZVBB-RV64-NEXT:    add a6, a7, a2
 ; ZVBB-RV64-NEXT:    vmv1r.v v2, v10
-; ZVBB-RV64-NEXT:    add a4, a1, a2
-; ZVBB-RV64-NEXT:    slli a5, a2, 2
 ; ZVBB-RV64-NEXT:    vmv1r.v v4, v14
-; ZVBB-RV64-NEXT:    slli a6, a2, 4
-; ZVBB-RV64-NEXT:    add a7, a4, a2
 ; ZVBB-RV64-NEXT:    vmv1r.v v6, v18
-; ZVBB-RV64-NEXT:    sub a5, a6, a5
+; ZVBB-RV64-NEXT:    vsseg7e32.v v1, (a1)
 ; ZVBB-RV64-NEXT:    vmv1r.v v22, v11
-; ZVBB-RV64-NEXT:    add a6, a7, a2
 ; ZVBB-RV64-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV64-NEXT:    vsseg7e32.v v1, (a0)
 ; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV64-NEXT:    vsseg7e32.v v21, (a1)
+; ZVBB-RV64-NEXT:    vsseg7e32.v v21, (a0)
 ; ZVBB-RV64-NEXT:    vl1re32.v v10, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1re32.v v11, (a6)
@@ -3206,30 +3206,30 @@ define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32
 ; ZVBB-RV64-NEXT:    vl1re32.v v18, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1re32.v v19, (a6)
-; ZVBB-RV64-NEXT:    vl1re32.v v16, (a0)
+; ZVBB-RV64-NEXT:    vl1re32.v v16, (a1)
 ; ZVBB-RV64-NEXT:    vl1re32.v v8, (a4)
 ; ZVBB-RV64-NEXT:    vl1re32.v v17, (a3)
 ; ZVBB-RV64-NEXT:    vl1re32.v v9, (a7)
-; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    csrr a1, vlenb
 ; ZVBB-RV64-NEXT:    li a3, 14
-; ZVBB-RV64-NEXT:    mul a0, a0, a3
-; ZVBB-RV64-NEXT:    add a0, sp, a0
-; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    mul a1, a1, a3
+; ZVBB-RV64-NEXT:    add a1, sp, a1
+; ZVBB-RV64-NEXT:    addi a1, a1, 64
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1re32.v v20, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1re32.v v21, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
-; ZVBB-RV64-NEXT:    add a2, a0, a2
+; ZVBB-RV64-NEXT:    add a2, a1, a2
+; ZVBB-RV64-NEXT:    add a5, a1, a5
 ; ZVBB-RV64-NEXT:    vl1re32.v v22, (a6)
-; ZVBB-RV64-NEXT:    vl1re32.v v23, (a1)
-; ZVBB-RV64-NEXT:    add a5, a0, a5
+; ZVBB-RV64-NEXT:    vl1re32.v v23, (a0)
 ; ZVBB-RV64-NEXT:    vs2r.v v12, (a5)
 ; ZVBB-RV64-NEXT:    vs4r.v v8, (a2)
-; ZVBB-RV64-NEXT:    vs8r.v v16, (a0)
+; ZVBB-RV64-NEXT:    vs8r.v v16, (a1)
 ; ZVBB-RV64-NEXT:    vl8re32.v v16, (a2)
-; ZVBB-RV64-NEXT:    vl8re32.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8re32.v v8, (a1)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
 ; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -3253,35 +3253,35 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vmv2r.v v26, v20
-; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    vmv2r.v v24, v16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 3
-; RV32-NEXT:    sub a1, a2, a1
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 64
 ; RV32-NEXT:    vmv2r.v v22, v12
-; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    vmv2r.v v20, v8
+; RV32-NEXT:    addi a1, sp, 64
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a2, a0, 3
+; RV32-NEXT:    sub a0, a2, a0
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    add a3, a1, a2
+; RV32-NEXT:    add a4, a0, a2
+; RV32-NEXT:    slli a5, a2, 2
+; RV32-NEXT:    slli a6, a2, 4
+; RV32-NEXT:    add a7, a4, a2
+; RV32-NEXT:    sub a5, a6, a5
 ; RV32-NEXT:    vmv1r.v v1, v20
 ; RV32-NEXT:    vmv1r.v v3, v22
 ; RV32-NEXT:    vmv1r.v v5, v24
 ; RV32-NEXT:    vmv1r.v v7, v26
-; RV32-NEXT:    add a3, a0, a2
+; RV32-NEXT:    add a6, a7, a2
 ; RV32-NEXT:    vmv1r.v v2, v10
-; RV32-NEXT:    add a4, a1, a2
-; RV32-NEXT:    slli a5, a2, 2
 ; RV32-NEXT:    vmv1r.v v4, v14
-; RV32-NEXT:    slli a6, a2, 4
-; RV32-NEXT:    add a7, a4, a2
 ; RV32-NEXT:    vmv1r.v v6, v18
-; RV32-NEXT:    sub a5, a6, a5
+; RV32-NEXT:    vsseg7e64.v v1, (a1)
 ; RV32-NEXT:    vmv1r.v v22, v11
-; RV32-NEXT:    add a6, a7, a2
 ; RV32-NEXT:    vmv1r.v v24, v15
-; RV32-NEXT:    vsseg7e64.v v1, (a0)
 ; RV32-NEXT:    vmv1r.v v26, v19
-; RV32-NEXT:    vsseg7e64.v v21, (a1)
+; RV32-NEXT:    vsseg7e64.v v21, (a0)
 ; RV32-NEXT:    vl1re64.v v10, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1re64.v v11, (a6)
@@ -3293,30 +3293,30 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; RV32-NEXT:    vl1re64.v v18, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1re64.v v19, (a6)
-; RV32-NEXT:    vl1re64.v v16, (a0)
+; RV32-NEXT:    vl1re64.v v16, (a1)
 ; RV32-NEXT:    vl1re64.v v8, (a4)
 ; RV32-NEXT:    vl1re64.v v17, (a3)
 ; RV32-NEXT:    vl1re64.v v9, (a7)
-; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a3, 14
-; RV32-NEXT:    mul a0, a0, a3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 64
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1re64.v v20, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1re64.v v21, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    add a2, a1, a2
+; RV32-NEXT:    add a5, a1, a5
 ; RV32-NEXT:    vl1re64.v v22, (a6)
-; RV32-NEXT:    vl1re64.v v23, (a1)
-; RV32-NEXT:    add a5, a0, a5
+; RV32-NEXT:    vl1re64.v v23, (a0)
 ; RV32-NEXT:    vs2r.v v12, (a5)
 ; RV32-NEXT:    vs4r.v v8, (a2)
-; RV32-NEXT:    vs8r.v v16, (a0)
+; RV32-NEXT:    vs8r.v v16, (a1)
 ; RV32-NEXT:    vl8re64.v v16, (a2)
-; RV32-NEXT:    vl8re64.v v8, (a0)
+; RV32-NEXT:    vl8re64.v v8, (a1)
 ; RV32-NEXT:    addi sp, s0, -80
 ; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
@@ -3335,35 +3335,35 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV64-NEXT:    vmv2r.v v26, v20
-; RV64-NEXT:    addi a0, sp, 64
 ; RV64-NEXT:    vmv2r.v v24, v16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    sub a1, a2, a1
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 64
 ; RV64-NEXT:    vmv2r.v v22, v12
-; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    vmv2r.v v20, v8
+; RV64-NEXT:    addi a1, sp, 64
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a2, a0, 3
+; RV64-NEXT:    sub a0, a2, a0
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    add a3, a1, a2
+; RV64-NEXT:    add a4, a0, a2
+; RV64-NEXT:    slli a5, a2, 2
+; RV64-NEXT:    slli a6, a2, 4
+; RV64-NEXT:    add a7, a4, a2
+; RV64-NEXT:    sub a5, a6, a5
 ; RV64-NEXT:    vmv1r.v v1, v20
 ; RV64-NEXT:    vmv1r.v v3, v22
 ; RV64-NEXT:    vmv1r.v v5, v24
 ; RV64-NEXT:    vmv1r.v v7, v26
-; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    add a6, a7, a2
 ; RV64-NEXT:    vmv1r.v v2, v10
-; RV64-NEXT:    add a4, a1, a2
-; RV64-NEXT:    slli a5, a2, 2
 ; RV64-NEXT:    vmv1r.v v4, v14
-; RV64-NEXT:    slli a6, a2, 4
-; RV64-NEXT:    add a7, a4, a2
 ; RV64-NEXT:    vmv1r.v v6, v18
-; RV64-NEXT:    sub a5, a6, a5
+; RV64-NEXT:    vsseg7e64.v v1, (a1)
 ; RV64-NEXT:    vmv1r.v v22, v11
-; RV64-NEXT:    add a6, a7, a2
 ; RV64-NEXT:    vmv1r.v v24, v15
-; RV64-NEXT:    vsseg7e64.v v1, (a0)
 ; RV64-NEXT:    vmv1r.v v26, v19
-; RV64-NEXT:    vsseg7e64.v v21, (a1)
+; RV64-NEXT:    vsseg7e64.v v21, (a0)
 ; RV64-NEXT:    vl1re64.v v10, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1re64.v v11, (a6)
@@ -3375,30 +3375,30 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; RV64-NEXT:    vl1re64.v v18, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1re64.v v19, (a6)
-; RV64-NEXT:    vl1re64.v v16, (a0)
+; RV64-NEXT:    vl1re64.v v16, (a1)
 ; RV64-NEXT:    vl1re64.v v8, (a4)
 ; RV64-NEXT:    vl1re64.v v17, (a3)
 ; RV64-NEXT:    vl1re64.v v9, (a7)
-; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a3, 14
-; RV64-NEXT:    mul a0, a0, a3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    mul a1, a1, a3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 64
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1re64.v v20, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1re64.v v21, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a2, a0, a2
+; RV64-NEXT:    add a2, a1, a2
+; RV64-NEXT:    add a5, a1, a5
 ; RV64-NEXT:    vl1re64.v v22, (a6)
-; RV64-NEXT:    vl1re64.v v23, (a1)
-; RV64-NEXT:    add a5, a0, a5
+; RV64-NEXT:    vl1re64.v v23, (a0)
 ; RV64-NEXT:    vs2r.v v12, (a5)
 ; RV64-NEXT:    vs4r.v v8, (a2)
-; RV64-NEXT:    vs8r.v v16, (a0)
+; RV64-NEXT:    vs8r.v v16, (a1)
 ; RV64-NEXT:    vl8re64.v v16, (a2)
-; RV64-NEXT:    vl8re64.v v8, (a0)
+; RV64-NEXT:    vl8re64.v v8, (a1)
 ; RV64-NEXT:    addi sp, s0, -80
 ; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -3417,35 +3417,35 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
 ; ZVBB-RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; ZVBB-RV32-NEXT:    vmv2r.v v26, v20
-; ZVBB-RV32-NEXT:    addi a0, sp, 64
 ; ZVBB-RV32-NEXT:    vmv2r.v v24, v16
-; ZVBB-RV32-NEXT:    csrr a1, vlenb
-; ZVBB-RV32-NEXT:    slli a2, a1, 3
-; ZVBB-RV32-NEXT:    sub a1, a2, a1
-; ZVBB-RV32-NEXT:    add a1, sp, a1
-; ZVBB-RV32-NEXT:    addi a1, a1, 64
 ; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
-; ZVBB-RV32-NEXT:    csrr a2, vlenb
 ; ZVBB-RV32-NEXT:    vmv2r.v v20, v8
+; ZVBB-RV32-NEXT:    addi a1, sp, 64
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    slli a2, a0, 3
+; ZVBB-RV32-NEXT:    sub a0, a2, a0
+; ZVBB-RV32-NEXT:    add a0, sp, a0
+; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    csrr a2, vlenb
+; ZVBB-RV32-NEXT:    add a3, a1, a2
+; ZVBB-RV32-NEXT:    add a4, a0, a2
+; ZVBB-RV32-NEXT:    slli a5, a2, 2
+; ZVBB-RV32-NEXT:    slli a6, a2, 4
+; ZVBB-RV32-NEXT:    add a7, a4, a2
+; ZVBB-RV32-NEXT:    sub a5, a6, a5
 ; ZVBB-RV32-NEXT:    vmv1r.v v1, v20
 ; ZVBB-RV32-NEXT:    vmv1r.v v3, v22
 ; ZVBB-RV32-NEXT:    vmv1r.v v5, v24
 ; ZVBB-RV32-NEXT:    vmv1r.v v7, v26
-; ZVBB-RV32-NEXT:    add a3, a0, a2
+; ZVBB-RV32-NEXT:    add a6, a7, a2
 ; ZVBB-RV32-NEXT:    vmv1r.v v2, v10
-; ZVBB-RV32-NEXT:    add a4, a1, a2
-; ZVBB-RV32-NEXT:    slli a5, a2, 2
 ; ZVBB-RV32-NEXT:    vmv1r.v v4, v14
-; ZVBB-RV32-NEXT:    slli a6, a2, 4
-; ZVBB-RV32-NEXT:    add a7, a4, a2
 ; ZVBB-RV32-NEXT:    vmv1r.v v6, v18
-; ZVBB-RV32-NEXT:    sub a5, a6, a5
+; ZVBB-RV32-NEXT:    vsseg7e64.v v1, (a1)
 ; ZVBB-RV32-NEXT:    vmv1r.v v22, v11
-; ZVBB-RV32-NEXT:    add a6, a7, a2
 ; ZVBB-RV32-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV32-NEXT:    vsseg7e64.v v1, (a0)
 ; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV32-NEXT:    vsseg7e64.v v21, (a1)
+; ZVBB-RV32-NEXT:    vsseg7e64.v v21, (a0)
 ; ZVBB-RV32-NEXT:    vl1re64.v v10, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1re64.v v11, (a6)
@@ -3457,30 +3457,30 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; ZVBB-RV32-NEXT:    vl1re64.v v18, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1re64.v v19, (a6)
-; ZVBB-RV32-NEXT:    vl1re64.v v16, (a0)
+; ZVBB-RV32-NEXT:    vl1re64.v v16, (a1)
 ; ZVBB-RV32-NEXT:    vl1re64.v v8, (a4)
 ; ZVBB-RV32-NEXT:    vl1re64.v v17, (a3)
 ; ZVBB-RV32-NEXT:    vl1re64.v v9, (a7)
-; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    csrr a1, vlenb
 ; ZVBB-RV32-NEXT:    li a3, 14
-; ZVBB-RV32-NEXT:    mul a0, a0, a3
-; ZVBB-RV32-NEXT:    add a0, sp, a0
-; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    mul a1, a1, a3
+; ZVBB-RV32-NEXT:    add a1, sp, a1
+; ZVBB-RV32-NEXT:    addi a1, a1, 64
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1re64.v v20, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1re64.v v21, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
-; ZVBB-RV32-NEXT:    add a2, a0, a2
+; ZVBB-RV32-NEXT:    add a2, a1, a2
+; ZVBB-RV32-NEXT:    add a5, a1, a5
 ; ZVBB-RV32-NEXT:    vl1re64.v v22, (a6)
-; ZVBB-RV32-NEXT:    vl1re64.v v23, (a1)
-; ZVBB-RV32-NEXT:    add a5, a0, a5
+; ZVBB-RV32-NEXT:    vl1re64.v v23, (a0)
 ; ZVBB-RV32-NEXT:    vs2r.v v12, (a5)
 ; ZVBB-RV32-NEXT:    vs4r.v v8, (a2)
-; ZVBB-RV32-NEXT:    vs8r.v v16, (a0)
+; ZVBB-RV32-NEXT:    vs8r.v v16, (a1)
 ; ZVBB-RV32-NEXT:    vl8re64.v v16, (a2)
-; ZVBB-RV32-NEXT:    vl8re64.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8re64.v v8, (a1)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
 ; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
@@ -3499,35 +3499,35 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
 ; ZVBB-RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; ZVBB-RV64-NEXT:    vmv2r.v v26, v20
-; ZVBB-RV64-NEXT:    addi a0, sp, 64
 ; ZVBB-RV64-NEXT:    vmv2r.v v24, v16
-; ZVBB-RV64-NEXT:    csrr a1, vlenb
-; ZVBB-RV64-NEXT:    slli a2, a1, 3
-; ZVBB-RV64-NEXT:    sub a1, a2, a1
-; ZVBB-RV64-NEXT:    add a1, sp, a1
-; ZVBB-RV64-NEXT:    addi a1, a1, 64
 ; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
-; ZVBB-RV64-NEXT:    csrr a2, vlenb
 ; ZVBB-RV64-NEXT:    vmv2r.v v20, v8
+; ZVBB-RV64-NEXT:    addi a1, sp, 64
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    slli a2, a0, 3
+; ZVBB-RV64-NEXT:    sub a0, a2, a0
+; ZVBB-RV64-NEXT:    add a0, sp, a0
+; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    csrr a2, vlenb
+; ZVBB-RV64-NEXT:    add a3, a1, a2
+; ZVBB-RV64-NEXT:    add a4, a0, a2
+; ZVBB-RV64-NEXT:    slli a5, a2, 2
+; ZVBB-RV64-NEXT:    slli a6, a2, 4
+; ZVBB-RV64-NEXT:    add a7, a4, a2
+; ZVBB-RV64-NEXT:    sub a5, a6, a5
 ; ZVBB-RV64-NEXT:    vmv1r.v v1, v20
 ; ZVBB-RV64-NEXT:    vmv1r.v v3, v22
 ; ZVBB-RV64-NEXT:    vmv1r.v v5, v24
 ; ZVBB-RV64-NEXT:    vmv1r.v v7, v26
-; ZVBB-RV64-NEXT:    add a3, a0, a2
+; ZVBB-RV64-NEXT:    add a6, a7, a2
 ; ZVBB-RV64-NEXT:    vmv1r.v v2, v10
-; ZVBB-RV64-NEXT:    add a4, a1, a2
-; ZVBB-RV64-NEXT:    slli a5, a2, 2
 ; ZVBB-RV64-NEXT:    vmv1r.v v4, v14
-; ZVBB-RV64-NEXT:    slli a6, a2, 4
-; ZVBB-RV64-NEXT:    add a7, a4, a2
 ; ZVBB-RV64-NEXT:    vmv1r.v v6, v18
-; ZVBB-RV64-NEXT:    sub a5, a6, a5
+; ZVBB-RV64-NEXT:    vsseg7e64.v v1, (a1)
 ; ZVBB-RV64-NEXT:    vmv1r.v v22, v11
-; ZVBB-RV64-NEXT:    add a6, a7, a2
 ; ZVBB-RV64-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV64-NEXT:    vsseg7e64.v v1, (a0)
 ; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV64-NEXT:    vsseg7e64.v v21, (a1)
+; ZVBB-RV64-NEXT:    vsseg7e64.v v21, (a0)
 ; ZVBB-RV64-NEXT:    vl1re64.v v10, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1re64.v v11, (a6)
@@ -3539,30 +3539,30 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; ZVBB-RV64-NEXT:    vl1re64.v v18, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1re64.v v19, (a6)
-; ZVBB-RV64-NEXT:    vl1re64.v v16, (a0)
+; ZVBB-RV64-NEXT:    vl1re64.v v16, (a1)
 ; ZVBB-RV64-NEXT:    vl1re64.v v8, (a4)
 ; ZVBB-RV64-NEXT:    vl1re64.v v17, (a3)
 ; ZVBB-RV64-NEXT:    vl1re64.v v9, (a7)
-; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    csrr a1, vlenb
 ; ZVBB-RV64-NEXT:    li a3, 14
-; ZVBB-RV64-NEXT:    mul a0, a0, a3
-; ZVBB-RV64-NEXT:    add a0, sp, a0
-; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    mul a1, a1, a3
+; ZVBB-RV64-NEXT:    add a1, sp, a1
+; ZVBB-RV64-NEXT:    addi a1, a1, 64
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1re64.v v20, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1re64.v v21, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
-; ZVBB-RV64-NEXT:    add a2, a0, a2
+; ZVBB-RV64-NEXT:    add a2, a1, a2
+; ZVBB-RV64-NEXT:    add a5, a1, a5
 ; ZVBB-RV64-NEXT:    vl1re64.v v22, (a6)
-; ZVBB-RV64-NEXT:    vl1re64.v v23, (a1)
-; ZVBB-RV64-NEXT:    add a5, a0, a5
+; ZVBB-RV64-NEXT:    vl1re64.v v23, (a0)
 ; ZVBB-RV64-NEXT:    vs2r.v v12, (a5)
 ; ZVBB-RV64-NEXT:    vs4r.v v8, (a2)
-; ZVBB-RV64-NEXT:    vs8r.v v16, (a0)
+; ZVBB-RV64-NEXT:    vs8r.v v16, (a1)
 ; ZVBB-RV64-NEXT:    vl8re64.v v16, (a2)
-; ZVBB-RV64-NEXT:    vl8re64.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8re64.v v8, (a1)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
 ; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp-combine.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp-combine.ll
index 35bed86d61176..1a16bff76284d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp-combine.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp-combine.ll
@@ -29,11 +29,11 @@ define <vscale x 1 x double> @test2(<vscale x 1 x double> %a, <vscale x 1 x i1>
 ; CHECK-NEXT:    lui a1, %hi(.LCPI1_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI1_0)(a1)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI1_1)
-; CHECK-NEXT:    fld fa4, %lo(.LCPI1_1)(a1)
-; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa5
+; CHECK-NEXT:    fld fa5, %lo(.LCPI1_1)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vfadd.vf v9, v9, fa4, v0.t
+; CHECK-NEXT:    vfadd.vf v9, v9, fa5, v0.t
 ; CHECK-NEXT:    vfmul.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %t = call <vscale x 1 x double> @llvm.vp.fmul.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> splat (double 2.0), <vscale x 1 x i1> %m, i32 %evl)
@@ -48,11 +48,11 @@ define <vscale x 1 x double> @test3(<vscale x 1 x double> %a, <vscale x 1 x doub
 ; CHECK-NEXT:    lui a1, %hi(.LCPI2_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI2_0)(a1)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI2_1)
-; CHECK-NEXT:    fld fa4, %lo(.LCPI2_1)(a1)
-; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v10, fa5
+; CHECK-NEXT:    fld fa5, %lo(.LCPI2_1)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vfmul.vf v10, v10, fa4, v0.t
+; CHECK-NEXT:    vfmul.vf v10, v10, fa5, v0.t
 ; CHECK-NEXT:    vfmadd.vv v10, v8, v9, v0.t
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
index 5ee5d40d8313d..4316d5cb403dd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
@@ -3576,24 +3576,17 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    mv a3, a1
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vmv8r.v v24, v16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -3601,96 +3594,46 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    srli a3, a1, 3
 ; CHECK-NEXT:    slli a5, a1, 3
 ; CHECK-NEXT:    sub a6, a4, a1
-; CHECK-NEXT:    add a7, a2, a5
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
+; CHECK-NEXT:    add a3, a2, a5
+; CHECK-NEXT:    vl8re64.v v8, (a3)
+; CHECK-NEXT:    sltu a3, a4, a6
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a6
 ; CHECK-NEXT:    add a5, a0, a5
-; CHECK-NEXT:    vl8re64.v v8, (a7)
-; CHECK-NEXT:    csrr a7, vlenb
-; CHECK-NEXT:    slli a7, a7, 3
-; CHECK-NEXT:    add a7, sp, a7
-; CHECK-NEXT:    addi a7, a7, 16
-; CHECK-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; CHECK-NEXT:    sltu a7, a4, a6
-; CHECK-NEXT:    addi a7, a7, -1
-; CHECK-NEXT:    vl8re64.v v8, (a5)
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 3
-; CHECK-NEXT:    mv t0, a5
-; CHECK-NEXT:    slli a5, a5, 2
-; CHECK-NEXT:    add a5, a5, t0
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v16, (a5)
+; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re64.v v8, (a2)
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
+; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re64.v v8, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vslidedown.vx v0, v0, a3
-; CHECK-NEXT:    and a0, a7, a6
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 2
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a2, a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a4, a1, .LBB128_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a4, a1
 ; CHECK-NEXT:  .LBB128_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add a0, a0, a1
@@ -3706,80 +3649,28 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double> %va, <vscale x 16 x double> %b, <vscale x 16 x double> %c, i32 zeroext %evl) {
 ; CHECK-LABEL: vfma_vv_nxv16f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
 ; CHECK-NEXT:    add a5, a2, a3
-; CHECK-NEXT:    vl8re64.v v8, (a5)
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 3
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v0, (a5)
 ; CHECK-NEXT:    sub a5, a4, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v24, (a3)
 ; CHECK-NEXT:    sltu a3, a4, a5
-; CHECK-NEXT:    vl8re64.v v8, (a2)
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v0, (a0)
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a5
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v24, v16, v8
+; CHECK-NEXT:    vfmadd.vv v24, v16, v0
+; CHECK-NEXT:    vl8re64.v v0, (a2)
+; CHECK-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-NEXT:    bltu a4, a1, .LBB129_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a4, a1
 ; CHECK-NEXT:  .LBB129_2:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v0, v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v0
+; CHECK-NEXT:    vfmadd.vv v16, v8, v0
+; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    vmv8r.v v16, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.fma.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x double> %b, <vscale x 16 x double> %c, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
index 901f3cd63fa9e..432994de33321 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
@@ -1108,20 +1108,15 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 48
+; CHECK-NEXT:    li a3, 24
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vmv8r.v v24, v16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -1129,86 +1124,46 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    srli a3, a1, 3
 ; CHECK-NEXT:    slli a5, a1, 3
 ; CHECK-NEXT:    sub a6, a4, a1
-; CHECK-NEXT:    add a7, a2, a5
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
+; CHECK-NEXT:    add a3, a2, a5
+; CHECK-NEXT:    vl8re64.v v8, (a3)
+; CHECK-NEXT:    sltu a3, a4, a6
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a6
 ; CHECK-NEXT:    add a5, a0, a5
-; CHECK-NEXT:    vl8re64.v v8, (a7)
-; CHECK-NEXT:    csrr a7, vlenb
-; CHECK-NEXT:    slli a7, a7, 3
-; CHECK-NEXT:    add a7, sp, a7
-; CHECK-NEXT:    addi a7, a7, 16
-; CHECK-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; CHECK-NEXT:    sltu a7, a4, a6
-; CHECK-NEXT:    addi a7, a7, -1
-; CHECK-NEXT:    vl8re64.v v8, (a5)
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li t0, 40
-; CHECK-NEXT:    mul a5, a5, t0
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v16, (a5)
+; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re64.v v8, (a2)
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
+; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re64.v v8, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vslidedown.vx v0, v0, a3
-; CHECK-NEXT:    and a0, a7, a6
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 40
-; CHECK-NEXT:    mul a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 40
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a4, a1, .LBB92_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a4, a1
 ; CHECK-NEXT:  .LBB92_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 40
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 48
+; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
@@ -1222,76 +1177,28 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double> %va, <vscale x 16 x double> %b, <vscale x 16 x double> %c, i32 zeroext %evl) {
 ; CHECK-LABEL: vfma_vv_nxv16f64_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
 ; CHECK-NEXT:    add a5, a2, a3
-; CHECK-NEXT:    vl8re64.v v8, (a5)
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 3
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v0, (a5)
 ; CHECK-NEXT:    sub a5, a4, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v24, (a3)
 ; CHECK-NEXT:    sltu a3, a4, a5
-; CHECK-NEXT:    vl8re64.v v8, (a2)
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v0, (a0)
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a5
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v24, v16, v8
+; CHECK-NEXT:    vfmadd.vv v24, v16, v0
+; CHECK-NEXT:    vl8re64.v v0, (a2)
+; CHECK-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-NEXT:    bltu a4, a1, .LBB93_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a4, a1
 ; CHECK-NEXT:  .LBB93_2:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v0, v16, v8
-; CHECK-NEXT:    vmv.v.v v8, v0
+; CHECK-NEXT:    vfmadd.vv v16, v8, v0
+; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    vmv8r.v v16, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.fmuladd.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x double> %b, <vscale x 16 x double> %c, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
index 63156e1399293..6f4d2dd626bfb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
@@ -149,69 +149,68 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a3, a1, 3
-; CHECK-NEXT:    srli a5, a1, 2
-; CHECK-NEXT:    slli a6, a1, 3
-; CHECK-NEXT:    slli a4, a1, 1
-; CHECK-NEXT:    vslidedown.vx v16, v0, a5
-; CHECK-NEXT:    add a6, a0, a6
-; CHECK-NEXT:    sub a5, a2, a4
-; CHECK-NEXT:    vl8re64.v v24, (a6)
-; CHECK-NEXT:    sltu a6, a2, a5
+; CHECK-NEXT:    srli a4, a1, 2
+; CHECK-NEXT:    slli a5, a1, 3
+; CHECK-NEXT:    slli a3, a1, 1
+; CHECK-NEXT:    vslidedown.vx v16, v0, a4
+; CHECK-NEXT:    add a5, a0, a5
+; CHECK-NEXT:    sub a4, a2, a3
+; CHECK-NEXT:    vl8re64.v v24, (a5)
+; CHECK-NEXT:    sltu a5, a2, a4
+; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    and a4, a5, a4
+; CHECK-NEXT:    sub a5, a4, a1
+; CHECK-NEXT:    sltu a6, a4, a5
 ; CHECK-NEXT:    addi a6, a6, -1
-; CHECK-NEXT:    and a5, a6, a5
-; CHECK-NEXT:    sub a6, a5, a1
-; CHECK-NEXT:    sltu a7, a5, a6
-; CHECK-NEXT:    addi a7, a7, -1
-; CHECK-NEXT:    vl8re64.v v8, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v16, a3
-; CHECK-NEXT:    and a0, a7, a6
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    and a6, a6, a5
+; CHECK-NEXT:    srli a5, a1, 3
+; CHECK-NEXT:    vsetvli a7, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v16, a5
+; CHECK-NEXT:    vsetvli zero, a6, e32, m4, ta, ma
 ; CHECK-NEXT:    vfncvt.f.f.w v20, v24, v0.t
-; CHECK-NEXT:    bltu a5, a1, .LBB8_2
+; CHECK-NEXT:    vl8re64.v v24, (a0)
+; CHECK-NEXT:    bltu a4, a1, .LBB8_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a5, a1
+; CHECK-NEXT:    mv a4, a1
 ; CHECK-NEXT:  .LBB8_2:
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v6, v7, a3
-; CHECK-NEXT:    vsetvli zero, a5, e32, m4, ta, ma
-; CHECK-NEXT:    vfncvt.f.f.w v16, v8, v0.t
-; CHECK-NEXT:    bltu a2, a4, .LBB8_4
+; CHECK-NEXT:    vslidedown.vx v6, v7, a5
+; CHECK-NEXT:    vsetvli zero, a4, e32, m4, ta, ma
+; CHECK-NEXT:    vfncvt.f.f.w v16, v24, v0.t
+; CHECK-NEXT:    bltu a2, a3, .LBB8_4
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    mv a2, a4
+; CHECK-NEXT:    mv a2, a3
 ; CHECK-NEXT:  .LBB8_4:
 ; CHECK-NEXT:    sub a0, a2, a1
 ; CHECK-NEXT:    sltu a3, a2, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a0, a3, a0
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfncvt.f.f.w v28, v8, v0.t
+; CHECK-NEXT:    vfncvt.f.f.w v12, v24, v0.t
 ; CHECK-NEXT:    bltu a2, a1, .LBB8_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB8_6:
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT:    vfncvt.f.f.w v24, v8, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vfncvt.f.f.w v8, v24, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
index 4336b27eb134a..3ace3ccdf0ee4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
@@ -175,8 +175,8 @@ define <vscale x 32 x bfloat> @vfsqrt_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    sltu a4, a0, a3
 ; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -210,9 +210,9 @@ define <vscale x 32 x bfloat> @vfsqrt_vv_nxv32bf16_unmasked(<vscale x 32 x bfloa
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    sltu a4, a0, a3
 ; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v16, a2
 ; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v16, a2
 ; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -462,8 +462,8 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    sltu a4, a0, a3
 ; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -503,9 +503,9 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    sltu a4, a0, a3
 ; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v16, a2
 ; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v16, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
index 053f1209cf214..86f692daa6885 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
@@ -2601,9 +2601,9 @@ define <vscale x 1 x i32> @vmand_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
 ; NOVLOPT-NEXT:    vmand.mm v8, v0, v8
 ; NOVLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; NOVLOPT-NEXT:    vmand.mm v0, v0, v8
-; NOVLOPT-NEXT:    vmv1r.v v8, v9
 ; NOVLOPT-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT:    vadd.vv v8, v9, v9, v0.t
+; NOVLOPT-NEXT:    vadd.vv v9, v9, v9, v0.t
+; NOVLOPT-NEXT:    vmv1r.v v8, v9
 ; NOVLOPT-NEXT:    ret
 ;
 ; VLOPT-LABEL: vmand_mm:
@@ -2611,9 +2611,9 @@ define <vscale x 1 x i32> @vmand_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
 ; VLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; VLOPT-NEXT:    vmand.mm v8, v0, v8
 ; VLOPT-NEXT:    vmand.mm v0, v0, v8
-; VLOPT-NEXT:    vmv1r.v v8, v9
 ; VLOPT-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT:    vadd.vv v8, v9, v9, v0.t
+; VLOPT-NEXT:    vadd.vv v9, v9, v9, v0.t
+; VLOPT-NEXT:    vmv1r.v v8, v9
 ; VLOPT-NEXT:    ret
   %1 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, iXLen -1)
   %2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
@@ -2628,9 +2628,9 @@ define <vscale x 1 x i32> @vmnand_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
 ; NOVLOPT-NEXT:    vmnand.mm v8, v0, v8
 ; NOVLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; NOVLOPT-NEXT:    vmand.mm v0, v0, v8
-; NOVLOPT-NEXT:    vmv1r.v v8, v9
 ; NOVLOPT-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT:    vadd.vv v8, v9, v9, v0.t
+; NOVLOPT-NEXT:    vadd.vv v9, v9, v9, v0.t
+; NOVLOPT-NEXT:    vmv1r.v v8, v9
 ; NOVLOPT-NEXT:    ret
 ;
 ; VLOPT-LABEL: vmnand_mm:
@@ -2638,9 +2638,9 @@ define <vscale x 1 x i32> @vmnand_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
 ; VLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; VLOPT-NEXT:    vmnand.mm v8, v0, v8
 ; VLOPT-NEXT:    vmand.mm v0, v0, v8
-; VLOPT-NEXT:    vmv1r.v v8, v9
 ; VLOPT-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT:    vadd.vv v8, v9, v9, v0.t
+; VLOPT-NEXT:    vadd.vv v9, v9, v9, v0.t
+; VLOPT-NEXT:    vmv1r.v v8, v9
 ; VLOPT-NEXT:    ret
   %1 = call <vscale x 1 x i1> @llvm.riscv.vmnand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, iXLen -1)
   %2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
@@ -2655,9 +2655,9 @@ define <vscale x 1 x i32> @vmandn_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
 ; NOVLOPT-NEXT:    vmandn.mm v8, v0, v8
 ; NOVLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; NOVLOPT-NEXT:    vmand.mm v0, v0, v8
-; NOVLOPT-NEXT:    vmv1r.v v8, v9
 ; NOVLOPT-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT:    vadd.vv v8, v9, v9, v0.t
+; NOVLOPT-NEXT:    vadd.vv v9, v9, v9, v0.t
+; NOVLOPT-NEXT:    vmv1r.v v8, v9
 ; NOVLOPT-NEXT:    ret
 ;
 ; VLOPT-LABEL: vmandn_mm:
@@ -2665,9 +2665,9 @@ define <vscale x 1 x i32> @vmandn_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
 ; VLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; VLOPT-NEXT:    vmandn.mm v8, v0, v8
 ; VLOPT-NEXT:    vmand.mm v0, v0, v8
-; VLOPT-NEXT:    vmv1r.v v8, v9
 ; VLOPT-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT:    vadd.vv v8, v9, v9, v0.t
+; VLOPT-NEXT:    vadd.vv v9, v9, v9, v0.t
+; VLOPT-NEXT:    vmv1r.v v8, v9
 ; VLOPT-NEXT:    ret
   %1 = call <vscale x 1 x i1> @llvm.riscv.vmandn.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, iXLen -1)
   %2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
@@ -2682,9 +2682,9 @@ define <vscale x 1 x i32> @vmxor_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
 ; NOVLOPT-NEXT:    vmxor.mm v8, v0, v8
 ; NOVLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; NOVLOPT-NEXT:    vmand.mm v0, v0, v8
-; NOVLOPT-NEXT:    vmv1r.v v8, v9
 ; NOVLOPT-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT:    vadd.vv v8, v9, v9, v0.t
+; NOVLOPT-NEXT:    vadd.vv v9, v9, v9, v0.t
+; NOVLOPT-NEXT:    vmv1r.v v8, v9
 ; NOVLOPT-NEXT:    ret
 ;
 ; VLOPT-LABEL: vmxor_mm:
@@ -2692,9 +2692,9 @@ define <vscale x 1 x i32> @vmxor_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
 ; VLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; VLOPT-NEXT:    vmxor.mm v8, v0, v8
 ; VLOPT-NEXT:    vmand.mm v0, v0, v8
-; VLOPT-NEXT:    vmv1r.v v8, v9
 ; VLOPT-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT:    vadd.vv v8, v9, v9, v0.t
+; VLOPT-NEXT:    vadd.vv v9, v9, v9, v0.t
+; VLOPT-NEXT:    vmv1r.v v8, v9
 ; VLOPT-NEXT:    ret
   %1 = call <vscale x 1 x i1> @llvm.riscv.vmxor.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, iXLen -1)
   %2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
@@ -2709,9 +2709,9 @@ define <vscale x 1 x i32> @vmor_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, <
 ; NOVLOPT-NEXT:    vmor.mm v8, v0, v8
 ; NOVLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; NOVLOPT-NEXT:    vmand.mm v0, v0, v8
-; NOVLOPT-NEXT:    vmv1r.v v8, v9
 ; NOVLOPT-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT:    vadd.vv v8, v9, v9, v0.t
+; NOVLOPT-NEXT:    vadd.vv v9, v9, v9, v0.t
+; NOVLOPT-NEXT:    vmv1r.v v8, v9
 ; NOVLOPT-NEXT:    ret
 ;
 ; VLOPT-LABEL: vmor_mm:
@@ -2719,9 +2719,9 @@ define <vscale x 1 x i32> @vmor_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, <
 ; VLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; VLOPT-NEXT:    vmor.mm v8, v0, v8
 ; VLOPT-NEXT:    vmand.mm v0, v0, v8
-; VLOPT-NEXT:    vmv1r.v v8, v9
 ; VLOPT-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT:    vadd.vv v8, v9, v9, v0.t
+; VLOPT-NEXT:    vadd.vv v9, v9, v9, v0.t
+; VLOPT-NEXT:    vmv1r.v v8, v9
 ; VLOPT-NEXT:    ret
   %1 = call <vscale x 1 x i1> @llvm.riscv.vmor.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, iXLen -1)
   %2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
@@ -2737,9 +2737,9 @@ define <vscale x 1 x i32> @vmnor_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
 ; NOVLOPT-NEXT:    vmnor.mm v8, v0, v8
 ; NOVLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; NOVLOPT-NEXT:    vmand.mm v0, v0, v8
-; NOVLOPT-NEXT:    vmv1r.v v8, v9
 ; NOVLOPT-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT:    vadd.vv v8, v9, v9, v0.t
+; NOVLOPT-NEXT:    vadd.vv v9, v9, v9, v0.t
+; NOVLOPT-NEXT:    vmv1r.v v8, v9
 ; NOVLOPT-NEXT:    ret
 ;
 ; VLOPT-LABEL: vmnor_mm:
@@ -2747,9 +2747,9 @@ define <vscale x 1 x i32> @vmnor_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
 ; VLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; VLOPT-NEXT:    vmnor.mm v8, v0, v8
 ; VLOPT-NEXT:    vmand.mm v0, v0, v8
-; VLOPT-NEXT:    vmv1r.v v8, v9
 ; VLOPT-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT:    vadd.vv v8, v9, v9, v0.t
+; VLOPT-NEXT:    vadd.vv v9, v9, v9, v0.t
+; VLOPT-NEXT:    vmv1r.v v8, v9
 ; VLOPT-NEXT:    ret
   %1 = call <vscale x 1 x i1> @llvm.riscv.vmnor.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, iXLen -1)
   %2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
@@ -2764,9 +2764,9 @@ define <vscale x 1 x i32> @vmorn_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
 ; NOVLOPT-NEXT:    vmorn.mm v8, v0, v8
 ; NOVLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; NOVLOPT-NEXT:    vmand.mm v0, v0, v8
-; NOVLOPT-NEXT:    vmv1r.v v8, v9
 ; NOVLOPT-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT:    vadd.vv v8, v9, v9, v0.t
+; NOVLOPT-NEXT:    vadd.vv v9, v9, v9, v0.t
+; NOVLOPT-NEXT:    vmv1r.v v8, v9
 ; NOVLOPT-NEXT:    ret
 ;
 ; VLOPT-LABEL: vmorn_mm:
@@ -2774,9 +2774,9 @@ define <vscale x 1 x i32> @vmorn_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
 ; VLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; VLOPT-NEXT:    vmorn.mm v8, v0, v8
 ; VLOPT-NEXT:    vmand.mm v0, v0, v8
-; VLOPT-NEXT:    vmv1r.v v8, v9
 ; VLOPT-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT:    vadd.vv v8, v9, v9, v0.t
+; VLOPT-NEXT:    vadd.vv v9, v9, v9, v0.t
+; VLOPT-NEXT:    vmv1r.v v8, v9
 ; VLOPT-NEXT:    ret
   %1 = call <vscale x 1 x i1> @llvm.riscv.vmorn.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, iXLen -1)
   %2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
@@ -2791,9 +2791,9 @@ define <vscale x 1 x i32> @vmxnor_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
 ; NOVLOPT-NEXT:    vmxnor.mm v8, v0, v8
 ; NOVLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; NOVLOPT-NEXT:    vmand.mm v0, v0, v8
-; NOVLOPT-NEXT:    vmv1r.v v8, v9
 ; NOVLOPT-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
-; NOVLOPT-NEXT:    vadd.vv v8, v9, v9, v0.t
+; NOVLOPT-NEXT:    vadd.vv v9, v9, v9, v0.t
+; NOVLOPT-NEXT:    vmv1r.v v8, v9
 ; NOVLOPT-NEXT:    ret
 ;
 ; VLOPT-LABEL: vmxnor_mm:
@@ -2801,9 +2801,9 @@ define <vscale x 1 x i32> @vmxnor_mm(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b,
 ; VLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; VLOPT-NEXT:    vmxnor.mm v8, v0, v8
 ; VLOPT-NEXT:    vmand.mm v0, v0, v8
-; VLOPT-NEXT:    vmv1r.v v8, v9
 ; VLOPT-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
-; VLOPT-NEXT:    vadd.vv v8, v9, v9, v0.t
+; VLOPT-NEXT:    vadd.vv v9, v9, v9, v0.t
+; VLOPT-NEXT:    vmv1r.v v8, v9
 ; VLOPT-NEXT:    ret
   %1 = call <vscale x 1 x i1> @llvm.riscv.vmxnor.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, iXLen -1)
   %2 = call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %1, iXLen %vl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
index 3df0763fdc757..1b3dd35910522 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
@@ -1077,12 +1077,11 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32_evl_nx16(<vscale x 32 x i32> %va, i
 ; RV64-LABEL: vmax_vx_nxv32i32_evl_nx16:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV64-NEXT:    vmax.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    srli a1, a1, 2
 ; RV64-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vx v24, v0, a1
-; RV64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV64-NEXT:    vmax.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    vsetivli zero, 0, e32, m8, ta, ma
 ; RV64-NEXT:    vmax.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
index 8147d467be04e..df7f177681f5e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
@@ -1076,12 +1076,11 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32_evl_nx16(<vscale x 32 x i32> %va,
 ; RV64-LABEL: vmaxu_vx_nxv32i32_evl_nx16:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV64-NEXT:    vmaxu.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    srli a1, a1, 2
 ; RV64-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vx v24, v0, a1
-; RV64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV64-NEXT:    vmaxu.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    vsetivli zero, 0, e32, m8, ta, ma
 ; RV64-NEXT:    vmaxu.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
index 614bd4cbde9ec..342c037371b57 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
@@ -1077,12 +1077,11 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32_evl_nx16(<vscale x 32 x i32> %va, i
 ; RV64-LABEL: vmin_vx_nxv32i32_evl_nx16:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV64-NEXT:    vmin.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    srli a1, a1, 2
 ; RV64-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vx v24, v0, a1
-; RV64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV64-NEXT:    vmin.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    vsetivli zero, 0, e32, m8, ta, ma
 ; RV64-NEXT:    vmin.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
index 21160553af59d..6821aa6c7e380 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
@@ -1076,12 +1076,11 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32_evl_nx16(<vscale x 32 x i32> %va,
 ; RV64-LABEL: vminu_vx_nxv32i32_evl_nx16:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV64-NEXT:    vminu.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    srli a1, a1, 2
 ; RV64-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vx v24, v0, a1
-; RV64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV64-NEXT:    vminu.vx v8, v8, a0, v0.t
-; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    vsetivli zero, 0, e32, m8, ta, ma
 ; RV64-NEXT:    vminu.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmseq.ll b/llvm/test/CodeGen/RISCV/rvv/vmseq.ll
index 6407f39a65e8b..275f96d1d526c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmseq.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmseq.ll
@@ -1670,12 +1670,12 @@ define <vscale x 1 x i1> @intrinsic_vmseq_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-LABEL: intrinsic_vmseq_mask_vx_nxv1i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
 ; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
 ; RV32-NEXT:    vmseq.vv v10, v8, v11, v0.t
 ; RV32-NEXT:    vmv.v.v v0, v10
@@ -1744,12 +1744,12 @@ define <vscale x 2 x i1> @intrinsic_vmseq_mask_vx_nxv2i64_i64(<vscale x 2 x i1>
 ; RV32-LABEL: intrinsic_vmseq_mask_vx_nxv2i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v10
 ; RV32-NEXT:    vmseq.vv v11, v8, v12, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v11
@@ -1818,12 +1818,12 @@ define <vscale x 4 x i1> @intrinsic_vmseq_mask_vx_nxv4i64_i64(<vscale x 4 x i1>
 ; RV32-LABEL: intrinsic_vmseq_mask_vx_nxv4i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    vmv1r.v v0, v12
 ; RV32-NEXT:    vmseq.vv v13, v8, v16, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v13
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsge.ll b/llvm/test/CodeGen/RISCV/rvv/vmsge.ll
index 45e3840f7e673..2c1a525220eea 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsge.ll
@@ -1725,12 +1725,12 @@ define <vscale x 1 x i1> @intrinsic_vmsge_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-LABEL: intrinsic_vmsge_mask_vx_nxv1i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
 ; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
 ; RV32-NEXT:    vmsle.vv v10, v11, v8, v0.t
 ; RV32-NEXT:    vmv.v.v v0, v10
@@ -1800,12 +1800,12 @@ define <vscale x 2 x i1> @intrinsic_vmsge_mask_vx_nxv2i64_i64(<vscale x 2 x i1>
 ; RV32-LABEL: intrinsic_vmsge_mask_vx_nxv2i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v10
 ; RV32-NEXT:    vmsle.vv v11, v12, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v11
@@ -1875,12 +1875,12 @@ define <vscale x 4 x i1> @intrinsic_vmsge_mask_vx_nxv4i64_i64(<vscale x 4 x i1>
 ; RV32-LABEL: intrinsic_vmsge_mask_vx_nxv4i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    vmv1r.v v0, v12
 ; RV32-NEXT:    vmsle.vv v13, v16, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v13
@@ -2872,12 +2872,12 @@ define <vscale x 2 x i1> @intrinsic_vmsge_maskedoff_mask_vx_nxv2i64_i64(<vscale
 ; RV32-LABEL: intrinsic_vmsge_maskedoff_mask_vx_nxv2i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    vmsle.vv v10, v12, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2904,12 +2904,12 @@ define <vscale x 4 x i1> @intrinsic_vmsge_maskedoff_mask_vx_nxv4i64_i64(<vscale
 ; RV32-LABEL: intrinsic_vmsge_maskedoff_mask_vx_nxv4i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vmv1r.v v12, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vmv1r.v v12, v0
 ; RV32-NEXT:    vmsle.vv v12, v16, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v12
 ; RV32-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll
index d3f57d58c7ab7..32d8af51e461d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll
@@ -1686,12 +1686,12 @@ define <vscale x 1 x i1> @intrinsic_vmsgeu_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-LABEL: intrinsic_vmsgeu_mask_vx_nxv1i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
 ; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
 ; RV32-NEXT:    vmsleu.vv v10, v11, v8, v0.t
 ; RV32-NEXT:    vmv.v.v v0, v10
@@ -1761,12 +1761,12 @@ define <vscale x 2 x i1> @intrinsic_vmsgeu_mask_vx_nxv2i64_i64(<vscale x 2 x i1>
 ; RV32-LABEL: intrinsic_vmsgeu_mask_vx_nxv2i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v10
 ; RV32-NEXT:    vmsleu.vv v11, v12, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v11
@@ -1836,12 +1836,12 @@ define <vscale x 4 x i1> @intrinsic_vmsgeu_mask_vx_nxv4i64_i64(<vscale x 4 x i1>
 ; RV32-LABEL: intrinsic_vmsgeu_mask_vx_nxv4i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    vmv1r.v v0, v12
 ; RV32-NEXT:    vmsleu.vv v13, v16, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v13
@@ -2851,12 +2851,12 @@ define <vscale x 2 x i1> @intrinsic_vmsgeu_maskedoff_mask_vx_nxv2i64_i64(<vscale
 ; RV32-LABEL: intrinsic_vmsgeu_maskedoff_mask_vx_nxv2i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    vmsleu.vv v10, v12, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2883,12 +2883,12 @@ define <vscale x 4 x i1> @intrinsic_vmsgeu_maskedoff_mask_vx_nxv4i64_i64(<vscale
 ; RV32-LABEL: intrinsic_vmsgeu_maskedoff_mask_vx_nxv4i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vmv1r.v v12, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vmv1r.v v12, v0
 ; RV32-NEXT:    vmsleu.vv v12, v16, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v12
 ; RV32-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsgt.ll b/llvm/test/CodeGen/RISCV/rvv/vmsgt.ll
index 62ac44bfdf38c..a84d52b2c03f9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsgt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsgt.ll
@@ -1670,12 +1670,12 @@ define <vscale x 1 x i1> @intrinsic_vmsgt_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-LABEL: intrinsic_vmsgt_mask_vx_nxv1i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
 ; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
 ; RV32-NEXT:    vmslt.vv v10, v11, v8, v0.t
 ; RV32-NEXT:    vmv.v.v v0, v10
@@ -1744,12 +1744,12 @@ define <vscale x 2 x i1> @intrinsic_vmsgt_mask_vx_nxv2i64_i64(<vscale x 2 x i1>
 ; RV32-LABEL: intrinsic_vmsgt_mask_vx_nxv2i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v10
 ; RV32-NEXT:    vmslt.vv v11, v12, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v11
@@ -1818,12 +1818,12 @@ define <vscale x 4 x i1> @intrinsic_vmsgt_mask_vx_nxv4i64_i64(<vscale x 4 x i1>
 ; RV32-LABEL: intrinsic_vmsgt_mask_vx_nxv4i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    vmv1r.v v0, v12
 ; RV32-NEXT:    vmslt.vv v13, v16, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v13
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll
index d57b9cd5bae53..f67d2ed047ae7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll
@@ -1670,12 +1670,12 @@ define <vscale x 1 x i1> @intrinsic_vmsgtu_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-LABEL: intrinsic_vmsgtu_mask_vx_nxv1i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
 ; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
 ; RV32-NEXT:    vmsltu.vv v10, v11, v8, v0.t
 ; RV32-NEXT:    vmv.v.v v0, v10
@@ -1744,12 +1744,12 @@ define <vscale x 2 x i1> @intrinsic_vmsgtu_mask_vx_nxv2i64_i64(<vscale x 2 x i1>
 ; RV32-LABEL: intrinsic_vmsgtu_mask_vx_nxv2i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v10
 ; RV32-NEXT:    vmsltu.vv v11, v12, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v11
@@ -1818,12 +1818,12 @@ define <vscale x 4 x i1> @intrinsic_vmsgtu_mask_vx_nxv4i64_i64(<vscale x 4 x i1>
 ; RV32-LABEL: intrinsic_vmsgtu_mask_vx_nxv4i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    vmv1r.v v0, v12
 ; RV32-NEXT:    vmsltu.vv v13, v16, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v13
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsle.ll b/llvm/test/CodeGen/RISCV/rvv/vmsle.ll
index 9653dfd2518d8..6aed4286c3495 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsle.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsle.ll
@@ -1670,12 +1670,12 @@ define <vscale x 1 x i1> @intrinsic_vmsle_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-LABEL: intrinsic_vmsle_mask_vx_nxv1i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
 ; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
 ; RV32-NEXT:    vmsle.vv v10, v8, v11, v0.t
 ; RV32-NEXT:    vmv.v.v v0, v10
@@ -1744,12 +1744,12 @@ define <vscale x 2 x i1> @intrinsic_vmsle_mask_vx_nxv2i64_i64(<vscale x 2 x i1>
 ; RV32-LABEL: intrinsic_vmsle_mask_vx_nxv2i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v10
 ; RV32-NEXT:    vmsle.vv v11, v8, v12, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v11
@@ -1818,12 +1818,12 @@ define <vscale x 4 x i1> @intrinsic_vmsle_mask_vx_nxv4i64_i64(<vscale x 4 x i1>
 ; RV32-LABEL: intrinsic_vmsle_mask_vx_nxv4i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    vmv1r.v v0, v12
 ; RV32-NEXT:    vmsle.vv v13, v8, v16, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v13
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll
index 25ecfa65c7c48..d881b12d7c1e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll
@@ -1670,12 +1670,12 @@ define <vscale x 1 x i1> @intrinsic_vmsleu_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-LABEL: intrinsic_vmsleu_mask_vx_nxv1i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
 ; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
 ; RV32-NEXT:    vmsleu.vv v10, v8, v11, v0.t
 ; RV32-NEXT:    vmv.v.v v0, v10
@@ -1744,12 +1744,12 @@ define <vscale x 2 x i1> @intrinsic_vmsleu_mask_vx_nxv2i64_i64(<vscale x 2 x i1>
 ; RV32-LABEL: intrinsic_vmsleu_mask_vx_nxv2i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v10
 ; RV32-NEXT:    vmsleu.vv v11, v8, v12, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v11
@@ -1818,12 +1818,12 @@ define <vscale x 4 x i1> @intrinsic_vmsleu_mask_vx_nxv4i64_i64(<vscale x 4 x i1>
 ; RV32-LABEL: intrinsic_vmsleu_mask_vx_nxv4i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    vmv1r.v v0, v12
 ; RV32-NEXT:    vmsleu.vv v13, v8, v16, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v13
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmslt.ll b/llvm/test/CodeGen/RISCV/rvv/vmslt.ll
index c17495e3b2119..26c3493dd03ab 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmslt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmslt.ll
@@ -1670,12 +1670,12 @@ define <vscale x 1 x i1> @intrinsic_vmslt_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-LABEL: intrinsic_vmslt_mask_vx_nxv1i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
 ; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
 ; RV32-NEXT:    vmslt.vv v10, v8, v11, v0.t
 ; RV32-NEXT:    vmv.v.v v0, v10
@@ -1744,12 +1744,12 @@ define <vscale x 2 x i1> @intrinsic_vmslt_mask_vx_nxv2i64_i64(<vscale x 2 x i1>
 ; RV32-LABEL: intrinsic_vmslt_mask_vx_nxv2i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v10
 ; RV32-NEXT:    vmslt.vv v11, v8, v12, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v11
@@ -1818,12 +1818,12 @@ define <vscale x 4 x i1> @intrinsic_vmslt_mask_vx_nxv4i64_i64(<vscale x 4 x i1>
 ; RV32-LABEL: intrinsic_vmslt_mask_vx_nxv4i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    vmv1r.v v0, v12
 ; RV32-NEXT:    vmslt.vv v13, v8, v16, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v13
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll
index a37a02848365d..2d4795b5b8d30 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll
@@ -1670,12 +1670,12 @@ define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-LABEL: intrinsic_vmsltu_mask_vx_nxv1i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
 ; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
 ; RV32-NEXT:    vmsltu.vv v10, v8, v11, v0.t
 ; RV32-NEXT:    vmv.v.v v0, v10
@@ -1744,12 +1744,12 @@ define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vx_nxv2i64_i64(<vscale x 2 x i1>
 ; RV32-LABEL: intrinsic_vmsltu_mask_vx_nxv2i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v10
 ; RV32-NEXT:    vmsltu.vv v11, v8, v12, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v11
@@ -1818,12 +1818,12 @@ define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vx_nxv4i64_i64(<vscale x 4 x i1>
 ; RV32-LABEL: intrinsic_vmsltu_mask_vx_nxv4i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    vmv1r.v v0, v12
 ; RV32-NEXT:    vmsltu.vv v13, v8, v16, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v13
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsne.ll b/llvm/test/CodeGen/RISCV/rvv/vmsne.ll
index ed41a18dcc8d3..9d43267f511e3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsne.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsne.ll
@@ -1670,12 +1670,12 @@ define <vscale x 1 x i1> @intrinsic_vmsne_mask_vx_nxv1i64_i64(<vscale x 1 x i1>
 ; RV32-LABEL: intrinsic_vmsne_mask_vx_nxv1i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
 ; RV32-NEXT:    vlse64.v v11, (a0), zero
-; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
 ; RV32-NEXT:    vmsne.vv v10, v8, v11, v0.t
 ; RV32-NEXT:    vmv.v.v v0, v10
@@ -1744,12 +1744,12 @@ define <vscale x 2 x i1> @intrinsic_vmsne_mask_vx_nxv2i64_i64(<vscale x 2 x i1>
 ; RV32-LABEL: intrinsic_vmsne_mask_vx_nxv2i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vmv1r.v v11, v0
 ; RV32-NEXT:    vmv1r.v v0, v10
 ; RV32-NEXT:    vmsne.vv v11, v8, v12, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v11
@@ -1818,12 +1818,12 @@ define <vscale x 4 x i1> @intrinsic_vmsne_mask_vx_nxv4i64_i64(<vscale x 4 x i1>
 ; RV32-LABEL: intrinsic_vmsne_mask_vx_nxv4i64_i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vmv1r.v v13, v0
 ; RV32-NEXT:    vmv1r.v v0, v12
 ; RV32-NEXT:    vmsne.vv v13, v8, v16, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v13
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll
index 4629db26ca034..647960a404d4b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll
@@ -248,8 +248,8 @@ define <vscale x 1 x i64> @intrinsic_vmv.s.x_x_nxv1i64(<vscale x 1 x i64> %0, i6
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
 ; RV32-NEXT:    vid.v v9
-; RV32-NEXT:    vmseq.vi v0, v9, 0
 ; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vmseq.vi v0, v9, 0
 ; RV32-NEXT:    vlse64.v v8, (a0), zero, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -274,8 +274,8 @@ define <vscale x 2 x i64> @intrinsic_vmv.s.x_x_nxv2i64(<vscale x 2 x i64> %0, i6
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
 ; RV32-NEXT:    vid.v v10
-; RV32-NEXT:    vmseq.vi v0, v10, 0
 ; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vmseq.vi v0, v10, 0
 ; RV32-NEXT:    vlse64.v v8, (a0), zero, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -300,8 +300,8 @@ define <vscale x 4 x i64> @intrinsic_vmv.s.x_x_nxv4i64(<vscale x 4 x i64> %0, i6
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
 ; RV32-NEXT:    vid.v v12
-; RV32-NEXT:    vmseq.vi v0, v12, 0
 ; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vmseq.vi v0, v12, 0
 ; RV32-NEXT:    vlse64.v v8, (a0), zero, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -326,8 +326,8 @@ define <vscale x 8 x i64> @intrinsic_vmv.s.x_x_nxv8i64(<vscale x 8 x i64> %0, i6
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
 ; RV32-NEXT:    vid.v v16
-; RV32-NEXT:    vmseq.vi v0, v16, 0
 ; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vmseq.vi v0, v16, 0
 ; RV32-NEXT:    vlse64.v v8, (a0), zero, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-combine-store-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/vp-combine-store-reverse.ll
index a2466c48b0ab7..622f7dfebec9c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-combine-store-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-combine-store-reverse.ll
@@ -65,9 +65,10 @@ define void @test_different_evl(<vscale x 2 x float> %val, <vscale x 2 x float>*
 ; CHECK-NEXT:    vrsub.vx v11, v11, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v12, v10, v9
-; CHECK-NEXT:    vmsne.vi v0, v12, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vrgather.vv v9, v8, v11
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vmsne.vi v0, v12, 0
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
 ; CHECK-NEXT:    vse32.v v9, (a0), v0.t
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-cttz-elts.ll b/llvm/test/CodeGen/RISCV/rvv/vp-cttz-elts.ll
index b316f5f878816..1c3f2ed6f81b9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-cttz-elts.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-cttz-elts.ll
@@ -169,18 +169,18 @@ define i1 @nxv2i32_cmp_evl(<vscale x 2 x i32> %src, <vscale x 2 x i1> %m, i32 %e
 ;
 ; RV64-LABEL: nxv2i32_cmp_evl:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    srli a1, a1, 32
-; RV64-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT:    sext.w a1, a0
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    srli a0, a0, 32
+; RV64-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; RV64-NEXT:    vmsne.vi v8, v8, 0, v0.t
 ; RV64-NEXT:    vfirst.m a2, v8, v0.t
-; RV64-NEXT:    sext.w a0, a0
 ; RV64-NEXT:    bltz a2, .LBB6_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a1, a2
+; RV64-NEXT:    mv a0, a2
 ; RV64-NEXT:  .LBB6_2:
-; RV64-NEXT:    sext.w a1, a1
-; RV64-NEXT:    xor a0, a1, a0
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    xor a0, a0, a1
 ; RV64-NEXT:    seqz a0, a0
 ; RV64-NEXT:    ret
   %r = call i32 @llvm.vp.cttz.elts.i32.nxv2i32(<vscale x 2 x i32> %src, i1 0, <vscale x 2 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index e481891dfd52f..2214523c58e5b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -331,8 +331,9 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract(<vscale x 2 x i1> %
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vmv1r.v v8, v0
+; RV32-NEXT:    slli a2, a1, 1
 ; RV32-NEXT:    vmv.v.i v9, 0
-; RV32-NEXT:    li a2, -1
+; RV32-NEXT:    li a1, -1
 ; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v10, 0
 ; RV32-NEXT:    csrr a3, vlenb
@@ -340,20 +341,19 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract(<vscale x 2 x i1> %
 ; RV32-NEXT:    vmerge.vim v11, v9, 1, v0
 ; RV32-NEXT:    srli a3, a3, 2
 ; RV32-NEXT:    vwaddu.vv v12, v11, v11
-; RV32-NEXT:    vwmaccu.vx v12, a2, v11
+; RV32-NEXT:    vwmaccu.vx v12, a1, v11
+; RV32-NEXT:    add a1, a3, a3
 ; RV32-NEXT:    vmsne.vi v0, v12, 0
-; RV32-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v11, v12, a3
 ; RV32-NEXT:    vmerge.vim v10, v10, 1, v0
-; RV32-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vmsne.vi v0, v11, 0
-; RV32-NEXT:    add a2, a3, a3
 ; RV32-NEXT:    vmerge.vim v9, v9, 1, v0
-; RV32-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
+; RV32-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; RV32-NEXT:    vslideup.vx v10, v9, a3
-; RV32-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vmsne.vi v0, v10, 0
-; RV32-NEXT:    slli a2, a1, 1
 ; RV32-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v10, (a0), v0.t
 ; RV32-NEXT:    li a1, 32
@@ -383,19 +383,19 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract(<vscale x 2 x i1> %
 ; RV64-NEXT:    srli a3, a3, 2
 ; RV64-NEXT:    vwaddu.vv v12, v11, v11
 ; RV64-NEXT:    vwmaccu.vx v12, a2, v11
+; RV64-NEXT:    add a1, a3, a3
 ; RV64-NEXT:    vmsne.vi v0, v12, 0
-; RV64-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
 ; RV64-NEXT:    vslidedown.vx v11, v12, a3
 ; RV64-NEXT:    vmerge.vim v10, v10, 1, v0
-; RV64-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vmsne.vi v0, v11, 0
-; RV64-NEXT:    add a1, a3, a3
 ; RV64-NEXT:    vmerge.vim v9, v9, 1, v0
 ; RV64-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; RV64-NEXT:    vslideup.vx v10, v9, a3
-; RV64-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vmsne.vi v0, v10, 0
 ; RV64-NEXT:    srli a1, a4, 32
+; RV64-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vmsne.vi v0, v10, 0
 ; RV64-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v10, (a0), v0.t
 ; RV64-NEXT:    li a1, 32
@@ -676,6 +676,7 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @not_same_mask(<vscale x 2 x i1>
 ; RV32-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vmv1r.v v9, v0
 ; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    slli a1, a1, 1
 ; RV32-NEXT:    vmv.v.i v8, 0
 ; RV32-NEXT:    li a2, -1
 ; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
@@ -688,19 +689,18 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @not_same_mask(<vscale x 2 x i1>
 ; RV32-NEXT:    srli a3, a3, 2
 ; RV32-NEXT:    vwaddu.vv v12, v9, v11
 ; RV32-NEXT:    vwmaccu.vx v12, a2, v11
+; RV32-NEXT:    add a2, a3, a3
 ; RV32-NEXT:    vmsne.vi v0, v12, 0
-; RV32-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v9, v12, a3
 ; RV32-NEXT:    vmerge.vim v10, v10, 1, v0
-; RV32-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vmsne.vi v0, v9, 0
-; RV32-NEXT:    add a2, a3, a3
 ; RV32-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
 ; RV32-NEXT:    vslideup.vx v10, v8, a3
 ; RV32-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vmsne.vi v0, v10, 0
-; RV32-NEXT:    slli a1, a1, 1
 ; RV32-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v10, (a0), v0.t
 ; RV32-NEXT:    li a0, 32
@@ -725,21 +725,21 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @not_same_mask(<vscale x 2 x i1>
 ; RV64-NEXT:    vmv1r.v v0, v9
 ; RV64-NEXT:    vmerge.vim v9, v8, 1, v0
 ; RV64-NEXT:    srli a3, a3, 2
+; RV64-NEXT:    srli a1, a1, 32
 ; RV64-NEXT:    vwaddu.vv v12, v9, v11
 ; RV64-NEXT:    vwmaccu.vx v12, a2, v11
+; RV64-NEXT:    add a2, a3, a3
 ; RV64-NEXT:    vmsne.vi v0, v12, 0
-; RV64-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; RV64-NEXT:    vslidedown.vx v9, v12, a3
 ; RV64-NEXT:    vmerge.vim v10, v10, 1, v0
-; RV64-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vmsne.vi v0, v9, 0
-; RV64-NEXT:    add a2, a3, a3
 ; RV64-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
 ; RV64-NEXT:    vslideup.vx v10, v8, a3
 ; RV64-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
 ; RV64-NEXT:    vmsne.vi v0, v10, 0
-; RV64-NEXT:    srli a1, a1, 32
 ; RV64-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v10, (a0), v0.t
 ; RV64-NEXT:    li a0, 32
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
index 1007d1ce649cc..eacc9b329fba3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
@@ -2435,11 +2435,11 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
 ; RV32-NEXT:    vmv1r.v v24, v0
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    sub a2, a0, a1
-; RV32-NEXT:    srli a3, a1, 3
-; RV32-NEXT:    vslidedown.vx v0, v0, a3
 ; RV32-NEXT:    sltu a3, a0, a2
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a2, a3, a2
+; RV32-NEXT:    srli a3, a1, 3
+; RV32-NEXT:    vslidedown.vx v0, v0, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (zero), v12, v0.t
 ; RV32-NEXT:    bltu a0, a1, .LBB111_2
@@ -2458,11 +2458,11 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
 ; RV64-NEXT:    vmv1r.v v24, v0
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    sub a2, a0, a1
-; RV64-NEXT:    srli a3, a1, 3
-; RV64-NEXT:    vslidedown.vx v0, v0, a3
 ; RV64-NEXT:    sltu a3, a0, a2
 ; RV64-NEXT:    addi a3, a3, -1
 ; RV64-NEXT:    and a2, a3, a2
+; RV64-NEXT:    srli a3, a1, 3
+; RV64-NEXT:    vslidedown.vx v0, v0, a3
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (zero), v16, v0.t
 ; RV64-NEXT:    bltu a0, a1, .LBB111_2
@@ -2480,10 +2480,10 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
 define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vpgather_baseidx_nxv16i16_nxv16f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a3, 8
+; RV32-NEXT:    li a2, 8
+; RV32-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; RV32-NEXT:    vwmulsu.vx v24, v8, a2
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; RV32-NEXT:    vwmulsu.vx v24, v8, a3
 ; RV32-NEXT:    mv a3, a1
 ; RV32-NEXT:    bltu a1, a2, .LBB112_2
 ; RV32-NEXT:  # %bb.1:
@@ -2495,9 +2495,9 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
 ; RV32-NEXT:    srli a2, a2, 3
 ; RV32-NEXT:    sltu a1, a1, a3
 ; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vx v0, v0, a2
 ; RV32-NEXT:    and a1, a1, a3
+; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vx v0, v0, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v28, v0.t
 ; RV32-NEXT:    ret
@@ -2506,21 +2506,20 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vmv1r.v v12, v0
-; RV64-NEXT:    vsext.vf4 v16, v10
+; RV64-NEXT:    vsext.vf4 v16, v8
+; RV64-NEXT:    vsext.vf4 v0, v10
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    vsll.vi v16, v16, 3
+; RV64-NEXT:    vsll.vi v24, v16, 3
+; RV64-NEXT:    vsll.vi v16, v0, 3
 ; RV64-NEXT:    sub a3, a1, a2
-; RV64-NEXT:    srli a4, a2, 3
-; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v0, v0, a4
 ; RV64-NEXT:    sltu a4, a1, a3
 ; RV64-NEXT:    addi a4, a4, -1
 ; RV64-NEXT:    and a3, a4, a3
+; RV64-NEXT:    srli a4, a2, 3
+; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vx v0, v12, a4
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
-; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf4 v24, v8
-; RV64-NEXT:    vsll.vi v24, v24, 3
 ; RV64-NEXT:    bltu a1, a2, .LBB112_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    mv a1, a2
@@ -2537,10 +2536,10 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
 define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vpgather_baseidx_sext_nxv16i16_nxv16f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a3, 8
+; RV32-NEXT:    li a2, 8
+; RV32-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; RV32-NEXT:    vwmulsu.vx v24, v8, a2
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; RV32-NEXT:    vwmulsu.vx v24, v8, a3
 ; RV32-NEXT:    mv a3, a1
 ; RV32-NEXT:    bltu a1, a2, .LBB113_2
 ; RV32-NEXT:  # %bb.1:
@@ -2552,9 +2551,9 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
 ; RV32-NEXT:    srli a2, a2, 3
 ; RV32-NEXT:    sltu a1, a1, a3
 ; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vx v0, v0, a2
 ; RV32-NEXT:    and a1, a1, a3
+; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vx v0, v0, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v28, v0.t
 ; RV32-NEXT:    ret
@@ -2564,20 +2563,19 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
 ; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vmv1r.v v12, v0
 ; RV64-NEXT:    vsext.vf4 v16, v10
+; RV64-NEXT:    vsext.vf4 v24, v8
 ; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    vsll.vi v24, v24, 3
 ; RV64-NEXT:    vsll.vi v16, v16, 3
 ; RV64-NEXT:    sub a3, a1, a2
-; RV64-NEXT:    srli a4, a2, 3
-; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v0, v0, a4
 ; RV64-NEXT:    sltu a4, a1, a3
 ; RV64-NEXT:    addi a4, a4, -1
 ; RV64-NEXT:    and a3, a4, a3
+; RV64-NEXT:    srli a4, a2, 3
+; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vx v0, v0, a4
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
-; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf4 v24, v8
-; RV64-NEXT:    vsll.vi v24, v24, 3
 ; RV64-NEXT:    bltu a1, a2, .LBB113_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    mv a1, a2
@@ -2595,10 +2593,10 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
 define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vpgather_baseidx_zext_nxv16i16_nxv16f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a3, 8
+; RV32-NEXT:    li a2, 8
+; RV32-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; RV32-NEXT:    vwmulu.vx v24, v8, a2
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; RV32-NEXT:    vwmulu.vx v24, v8, a3
 ; RV32-NEXT:    mv a3, a1
 ; RV32-NEXT:    bltu a1, a2, .LBB114_2
 ; RV32-NEXT:  # %bb.1:
@@ -2610,19 +2608,19 @@ define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base
 ; RV32-NEXT:    srli a2, a2, 3
 ; RV32-NEXT:    sltu a1, a1, a3
 ; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vx v0, v0, a2
 ; RV32-NEXT:    and a1, a1, a3
+; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vx v0, v0, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v28, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpgather_baseidx_zext_nxv16i16_nxv16f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a3, 8
+; RV64-NEXT:    li a2, 8
+; RV64-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; RV64-NEXT:    vwmulu.vx v24, v8, a2
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; RV64-NEXT:    vwmulu.vx v24, v8, a3
 ; RV64-NEXT:    mv a3, a1
 ; RV64-NEXT:    bltu a1, a2, .LBB114_2
 ; RV64-NEXT:  # %bb.1:
@@ -2634,9 +2632,9 @@ define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base
 ; RV64-NEXT:    srli a2, a2, 3
 ; RV64-NEXT:    sltu a1, a1, a3
 ; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v0, v0, a2
 ; RV64-NEXT:    and a1, a1, a3
+; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vx v0, v0, a2
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei32.v v16, (a0), v28, v0.t
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpload.ll b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
index 0844180e49612..b73659e7ce415 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
@@ -527,12 +527,12 @@ define <vscale x 16 x double> @vpload_nxv16f64(ptr %ptr, <vscale x 16 x i1> %m,
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    sub a3, a1, a2
 ; CHECK-NEXT:    slli a4, a2, 3
-; CHECK-NEXT:    srli a5, a2, 3
-; CHECK-NEXT:    vslidedown.vx v0, v0, a5
 ; CHECK-NEXT:    sltu a5, a1, a3
 ; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    and a3, a5, a3
+; CHECK-NEXT:    srli a5, a2, 3
 ; CHECK-NEXT:    add a4, a0, a4
+; CHECK-NEXT:    vslidedown.vx v0, v0, a5
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a4), v0.t
 ; CHECK-NEXT:    bltu a1, a2, .LBB44_2
@@ -591,9 +591,9 @@ define <vscale x 16 x double> @vpload_nxv17f64(ptr %ptr, ptr %out, <vscale x 17
 ; CHECK-NEXT:  .LBB45_4:
 ; CHECK-NEXT:    slli a5, a3, 4
 ; CHECK-NEXT:    srli a6, a3, 2
+; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    vsetvli a7, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, a6
-; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v24, (a5), v0.t
 ; CHECK-NEXT:    bltu a4, a3, .LBB45_6
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
index 4cd77185e6930..cb46f4a918541 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
@@ -554,17 +554,9 @@ declare <vscale x 128 x i8> @llvm.vp.merge.nxv128i8(<vscale x 128 x i1>, <vscale
 define <vscale x 128 x i8> @vpmerge_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpmerge_vv_nxv128i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    vmv8r.v v24, v16
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    slli a1, a1, 3
@@ -572,26 +564,19 @@ define <vscale x 128 x i8> @vpmerge_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    sub a4, a3, a1
 ; CHECK-NEXT:    vl8r.v v16, (a2)
 ; CHECK-NEXT:    sltu a2, a3, a4
-; CHECK-NEXT:    vl8r.v v8, (a0)
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a4
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vl8r.v v24, (a0)
 ; CHECK-NEXT:    bltu a3, a1, .LBB35_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a3, a1
 ; CHECK-NEXT:  .LBB35_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, tu, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    vmv8r.v v8, v24
 ; CHECK-NEXT:    ret
   %v = call <vscale x 128 x i8> @llvm.vp.merge.nxv128i8(<vscale x 128 x i1> %m, <vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, i32 %evl)
   ret <vscale x 128 x i8> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
index 2cf6248c17598..9340be684f2cf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
@@ -2268,9 +2268,9 @@ define void @vpscatter_nxv16f64(<vscale x 16 x double> %val, <vscale x 16 x ptr>
 ; RV32-NEXT:    srli a0, a0, 3
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vx v0, v0, a0
 ; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vx v0, v0, a0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v16, (zero), v28, v0.t
 ; RV32-NEXT:    ret
@@ -2289,21 +2289,21 @@ define void @vpscatter_nxv16f64(<vscale x 16 x double> %val, <vscale x 16 x ptr>
 ; RV64-NEXT:    slli a3, a1, 3
 ; RV64-NEXT:    add a3, a0, a3
 ; RV64-NEXT:    vl8re64.v v16, (a3)
+; RV64-NEXT:    mv a3, a2
 ; RV64-NEXT:    vl8re64.v v24, (a0)
-; RV64-NEXT:    mv a0, a2
 ; RV64-NEXT:    bltu a2, a1, .LBB108_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    mv a3, a1
 ; RV64-NEXT:  .LBB108_2:
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v8, (zero), v24, v0.t
 ; RV64-NEXT:    sub a0, a2, a1
 ; RV64-NEXT:    srli a1, a1, 3
 ; RV64-NEXT:    sltu a2, a2, a0
 ; RV64-NEXT:    addi a2, a2, -1
-; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    and a0, a2, a0
+; RV64-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -2323,10 +2323,10 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
 ; RV32-LABEL: vpscatter_baseidx_nxv16i16_nxv16f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vl4re16.v v4, (a1)
-; RV32-NEXT:    li a3, 8
+; RV32-NEXT:    li a1, 8
+; RV32-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; RV32-NEXT:    vwmulsu.vx v24, v4, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; RV32-NEXT:    vwmulsu.vx v24, v4, a3
 ; RV32-NEXT:    mv a3, a2
 ; RV32-NEXT:    bltu a2, a1, .LBB109_2
 ; RV32-NEXT:  # %bb.1:
@@ -2338,9 +2338,9 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
 ; RV32-NEXT:    srli a1, a1, 3
 ; RV32-NEXT:    sltu a2, a2, a3
 ; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vx v0, v0, a1
 ; RV32-NEXT:    and a2, a2, a3
+; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vx v0, v0, a1
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v16, (a0), v28, v0.t
 ; RV32-NEXT:    ret
@@ -2359,14 +2359,14 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
 ; RV64-NEXT:    addi a3, a3, 16
 ; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    vl4re16.v v24, (a1)
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf4 v16, v26
 ; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    addi a3, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsext.vf4 v16, v24
 ; RV64-NEXT:    vsll.vi v24, v16, 3
+; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    mv a3, a2
 ; RV64-NEXT:    bltu a2, a1, .LBB109_2
 ; RV64-NEXT:  # %bb.1:
@@ -2378,9 +2378,9 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
 ; RV64-NEXT:    srli a1, a1, 3
 ; RV64-NEXT:    sltu a2, a2, a3
 ; RV64-NEXT:    addi a2, a2, -1
-; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    and a2, a2, a3
+; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
@@ -2406,10 +2406,10 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV32-LABEL: vpscatter_baseidx_sext_nxv16i16_nxv16f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vl4re16.v v4, (a1)
-; RV32-NEXT:    li a3, 8
+; RV32-NEXT:    li a1, 8
+; RV32-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; RV32-NEXT:    vwmulsu.vx v24, v4, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; RV32-NEXT:    vwmulsu.vx v24, v4, a3
 ; RV32-NEXT:    mv a3, a2
 ; RV32-NEXT:    bltu a2, a1, .LBB110_2
 ; RV32-NEXT:  # %bb.1:
@@ -2421,9 +2421,9 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV32-NEXT:    srli a1, a1, 3
 ; RV32-NEXT:    sltu a2, a2, a3
 ; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vx v0, v0, a1
 ; RV32-NEXT:    and a2, a2, a3
+; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vx v0, v0, a1
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v16, (a0), v28, v0.t
 ; RV32-NEXT:    ret
@@ -2442,14 +2442,14 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV64-NEXT:    addi a3, a3, 16
 ; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    vl4re16.v v24, (a1)
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf4 v16, v26
 ; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    addi a3, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsext.vf4 v16, v24
 ; RV64-NEXT:    vsll.vi v24, v16, 3
+; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    mv a3, a2
 ; RV64-NEXT:    bltu a2, a1, .LBB110_2
 ; RV64-NEXT:  # %bb.1:
@@ -2461,9 +2461,9 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV64-NEXT:    srli a1, a1, 3
 ; RV64-NEXT:    sltu a2, a2, a3
 ; RV64-NEXT:    addi a2, a2, -1
-; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    and a2, a2, a3
+; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
@@ -2490,10 +2490,10 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV32-LABEL: vpscatter_baseidx_zext_nxv16i16_nxv16f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vl4re16.v v4, (a1)
-; RV32-NEXT:    li a3, 8
+; RV32-NEXT:    li a1, 8
+; RV32-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; RV32-NEXT:    vwmulu.vx v24, v4, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; RV32-NEXT:    vwmulu.vx v24, v4, a3
 ; RV32-NEXT:    mv a3, a2
 ; RV32-NEXT:    bltu a2, a1, .LBB111_2
 ; RV32-NEXT:  # %bb.1:
@@ -2505,9 +2505,9 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV32-NEXT:    srli a1, a1, 3
 ; RV32-NEXT:    sltu a2, a2, a3
 ; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vx v0, v0, a1
 ; RV32-NEXT:    and a2, a2, a3
+; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vx v0, v0, a1
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v16, (a0), v28, v0.t
 ; RV32-NEXT:    ret
@@ -2515,10 +2515,10 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV64-LABEL: vpscatter_baseidx_zext_nxv16i16_nxv16f64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vl4re16.v v4, (a1)
-; RV64-NEXT:    li a3, 8
+; RV64-NEXT:    li a1, 8
+; RV64-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; RV64-NEXT:    vwmulu.vx v24, v4, a1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; RV64-NEXT:    vwmulu.vx v24, v4, a3
 ; RV64-NEXT:    mv a3, a2
 ; RV64-NEXT:    bltu a2, a1, .LBB111_2
 ; RV64-NEXT:  # %bb.1:
@@ -2530,9 +2530,9 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV64-NEXT:    srli a1, a1, 3
 ; RV64-NEXT:    sltu a2, a2, a3
 ; RV64-NEXT:    addi a2, a2, -1
-; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    and a2, a2, a3
+; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei32.v v16, (a0), v28, v0.t
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
index 7e7da529bf3d7..5cb4176a1be19 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
@@ -439,15 +439,15 @@ define void @vpstore_nxv16f64(<vscale x 16 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:  .LBB35_2:
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a0), v0.t
-; CHECK-NEXT:    srli a3, a2, 3
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a3
+; CHECK-NEXT:    slli a3, a2, 3
+; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    srli a2, a2, 3
 ; CHECK-NEXT:    sltu a1, a1, a3
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a1, a1, a3
-; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
@@ -499,21 +499,21 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:    sltu a2, a2, a4
 ; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a5, a5, a0
-; CHECK-NEXT:    and a0, a2, a4
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
+; CHECK-NEXT:    and a0, a5, a0
+; CHECK-NEXT:    addi a5, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a6), v0.t
+; CHECK-NEXT:    and a0, a2, a4
 ; CHECK-NEXT:    bltu a0, a3, .LBB36_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a0, a3
 ; CHECK-NEXT:  .LBB36_6:
 ; CHECK-NEXT:    slli a2, a3, 4
 ; CHECK-NEXT:    srli a3, a3, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a3
 ; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v24, a3
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v16, (a1), v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
index a78130e8f102f..ce4e72a31076c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
@@ -1003,13 +1003,13 @@ define half @vreduce_fmin_nxv10f16(<vscale x 10 x half> %v) {
 ; CHECK-LABEL: vreduce_fmin_nxv10f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 10
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    lui a1, %hi(.LCPI73_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI73_0)
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v12, (a1)
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    li a1, 10
-; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfredmin.vs v12, v8, v12
 ; CHECK-NEXT:    vfmv.f.s fa0, v12
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-subreg-liveness.ll
index 7b460f2c058f8..df0792a68e05a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-subreg-liveness.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-subreg-liveness.ll
@@ -12,11 +12,11 @@
 define internal void @foo(<vscale x 1 x i16> %v15, <vscale x 1 x i16> %0, <vscale x 1 x i16> %vs12.i.i.i, <vscale x 1 x i16> %1, <vscale x 8 x i8> %v37) {
 ; NOSUBREG-LABEL: foo:
 ; NOSUBREG:       # %bb.0: # %loopIR.preheader.i.i
-; NOSUBREG-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; NOSUBREG-NEXT:    vmv.v.i v9, 0
-; NOSUBREG-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; NOSUBREG-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; NOSUBREG-NEXT:    vmv.v.i v14, 0
-; NOSUBREG-NEXT:    vmv1r.v v8, v9
+; NOSUBREG-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; NOSUBREG-NEXT:    vmv.v.i v9, 0
+; NOSUBREG-NEXT:    vmv.v.i v8, 0
 ; NOSUBREG-NEXT:    vsetivli zero, 4, e8, m1, tu, ma
 ; NOSUBREG-NEXT:    vrgatherei16.vv v8, v9, v14
 ; NOSUBREG-NEXT:  .LBB0_1: # %loopIR3.i.i
@@ -32,11 +32,11 @@ define internal void @foo(<vscale x 1 x i16> %v15, <vscale x 1 x i16> %0, <vscal
 ;
 ; SUBREG-LABEL: foo:
 ; SUBREG:       # %bb.0: # %loopIR.preheader.i.i
-; SUBREG-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; SUBREG-NEXT:    vmv.v.i v9, 0
-; SUBREG-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; SUBREG-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; SUBREG-NEXT:    vmv.v.i v14, 0
-; SUBREG-NEXT:    vmv1r.v v8, v9
+; SUBREG-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; SUBREG-NEXT:    vmv.v.i v9, 0
+; SUBREG-NEXT:    vmv.v.i v8, 0
 ; SUBREG-NEXT:    vsetivli zero, 4, e8, m1, tu, ma
 ; SUBREG-NEXT:    vrgatherei16.vv v8, v9, v14
 ; SUBREG-NEXT:  .LBB0_1: # %loopIR3.i.i
diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
index 58b6f0253b99a..c1ee747bc2602 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
@@ -448,34 +448,34 @@ define <vscale x 2 x i32> @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y,
 ; NO_FOLDING:       # %bb.0:
 ; NO_FOLDING-NEXT:    vsetvli a3, zero, e32, m1, ta, mu
 ; NO_FOLDING-NEXT:    vlm.v v0, (a0)
-; NO_FOLDING-NEXT:    vlm.v v8, (a2)
-; NO_FOLDING-NEXT:    vlm.v v9, (a1)
+; NO_FOLDING-NEXT:    vlm.v v8, (a1)
+; NO_FOLDING-NEXT:    vlm.v v9, (a2)
 ; NO_FOLDING-NEXT:    vmv.v.i v10, 0
 ; NO_FOLDING-NEXT:    vmerge.vim v11, v10, 1, v0
-; NO_FOLDING-NEXT:    vmv.v.v v0, v8
-; NO_FOLDING-NEXT:    vmerge.vim v8, v10, 1, v0
-; NO_FOLDING-NEXT:    vadd.vv v10, v11, v8
-; NO_FOLDING-NEXT:    vsub.vv v8, v11, v8
 ; NO_FOLDING-NEXT:    vmv.v.v v0, v9
+; NO_FOLDING-NEXT:    vmerge.vim v9, v10, 1, v0
+; NO_FOLDING-NEXT:    vadd.vv v10, v11, v9
+; NO_FOLDING-NEXT:    vsub.vv v9, v11, v9
+; NO_FOLDING-NEXT:    vmv.v.v v0, v8
 ; NO_FOLDING-NEXT:    vor.vv v10, v10, v11, v0.t
-; NO_FOLDING-NEXT:    vor.vv v8, v10, v8
+; NO_FOLDING-NEXT:    vor.vv v8, v10, v9
 ; NO_FOLDING-NEXT:    ret
 ;
 ; FOLDING-LABEL: vwop_vscale_zext_i1i32_multiple_users:
 ; FOLDING:       # %bb.0:
 ; FOLDING-NEXT:    vsetvli a3, zero, e32, m1, ta, mu
 ; FOLDING-NEXT:    vlm.v v0, (a0)
-; FOLDING-NEXT:    vlm.v v8, (a2)
-; FOLDING-NEXT:    vlm.v v9, (a1)
+; FOLDING-NEXT:    vlm.v v8, (a1)
+; FOLDING-NEXT:    vlm.v v9, (a2)
 ; FOLDING-NEXT:    vmv.v.i v10, 0
 ; FOLDING-NEXT:    vmerge.vim v11, v10, 1, v0
-; FOLDING-NEXT:    vmv.v.v v0, v8
-; FOLDING-NEXT:    vmerge.vim v8, v10, 1, v0
-; FOLDING-NEXT:    vadd.vv v10, v11, v8
-; FOLDING-NEXT:    vsub.vv v8, v11, v8
 ; FOLDING-NEXT:    vmv.v.v v0, v9
+; FOLDING-NEXT:    vmerge.vim v9, v10, 1, v0
+; FOLDING-NEXT:    vadd.vv v10, v11, v9
+; FOLDING-NEXT:    vsub.vv v9, v11, v9
+; FOLDING-NEXT:    vmv.v.v v0, v8
 ; FOLDING-NEXT:    vor.vv v10, v10, v11, v0.t
-; FOLDING-NEXT:    vor.vv v8, v10, v8
+; FOLDING-NEXT:    vor.vv v8, v10, v9
 ; FOLDING-NEXT:    ret
   %a = load <vscale x 2 x i1>, ptr %x
   %b = load <vscale x 2 x i1>, ptr %y
@@ -496,34 +496,34 @@ define <vscale x 2 x i8> @vwop_vscale_zext_i1i8_multiple_users(ptr %x, ptr %y, p
 ; NO_FOLDING:       # %bb.0:
 ; NO_FOLDING-NEXT:    vsetvli a3, zero, e8, mf4, ta, mu
 ; NO_FOLDING-NEXT:    vlm.v v0, (a0)
-; NO_FOLDING-NEXT:    vlm.v v8, (a2)
-; NO_FOLDING-NEXT:    vlm.v v9, (a1)
+; NO_FOLDING-NEXT:    vlm.v v8, (a1)
+; NO_FOLDING-NEXT:    vlm.v v9, (a2)
 ; NO_FOLDING-NEXT:    vmv.v.i v10, 0
 ; NO_FOLDING-NEXT:    vmerge.vim v11, v10, 1, v0
-; NO_FOLDING-NEXT:    vmv1r.v v0, v8
-; NO_FOLDING-NEXT:    vmerge.vim v8, v10, 1, v0
-; NO_FOLDING-NEXT:    vadd.vv v10, v11, v8
-; NO_FOLDING-NEXT:    vsub.vv v8, v11, v8
 ; NO_FOLDING-NEXT:    vmv1r.v v0, v9
+; NO_FOLDING-NEXT:    vmerge.vim v9, v10, 1, v0
+; NO_FOLDING-NEXT:    vadd.vv v10, v11, v9
+; NO_FOLDING-NEXT:    vsub.vv v9, v11, v9
+; NO_FOLDING-NEXT:    vmv1r.v v0, v8
 ; NO_FOLDING-NEXT:    vor.vv v10, v10, v11, v0.t
-; NO_FOLDING-NEXT:    vor.vv v8, v10, v8
+; NO_FOLDING-NEXT:    vor.vv v8, v10, v9
 ; NO_FOLDING-NEXT:    ret
 ;
 ; FOLDING-LABEL: vwop_vscale_zext_i1i8_multiple_users:
 ; FOLDING:       # %bb.0:
 ; FOLDING-NEXT:    vsetvli a3, zero, e8, mf4, ta, mu
 ; FOLDING-NEXT:    vlm.v v0, (a0)
-; FOLDING-NEXT:    vlm.v v8, (a2)
-; FOLDING-NEXT:    vlm.v v9, (a1)
+; FOLDING-NEXT:    vlm.v v8, (a1)
+; FOLDING-NEXT:    vlm.v v9, (a2)
 ; FOLDING-NEXT:    vmv.v.i v10, 0
 ; FOLDING-NEXT:    vmerge.vim v11, v10, 1, v0
-; FOLDING-NEXT:    vmv1r.v v0, v8
-; FOLDING-NEXT:    vmerge.vim v8, v10, 1, v0
-; FOLDING-NEXT:    vadd.vv v10, v11, v8
-; FOLDING-NEXT:    vsub.vv v8, v11, v8
 ; FOLDING-NEXT:    vmv1r.v v0, v9
+; FOLDING-NEXT:    vmerge.vim v9, v10, 1, v0
+; FOLDING-NEXT:    vadd.vv v10, v11, v9
+; FOLDING-NEXT:    vsub.vv v9, v11, v9
+; FOLDING-NEXT:    vmv1r.v v0, v8
 ; FOLDING-NEXT:    vor.vv v10, v10, v11, v0.t
-; FOLDING-NEXT:    vor.vv v8, v10, v8
+; FOLDING-NEXT:    vor.vv v8, v10, v9
 ; FOLDING-NEXT:    ret
   %a = load <vscale x 2 x i1>, ptr %x
   %b = load <vscale x 2 x i1>, ptr %y
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
index be2fc6955294d..cc923d8acd245 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
@@ -519,10 +519,10 @@ define void @vselect_legalize_regression(<vscale x 16 x double> %a, <vscale x 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a2, a0, 3
 ; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    vmand.mm v7, v0, v24
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v7, a2
-; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v24, 0
 ; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index d4ebe27420d7b..f53e4357e7b51 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -351,55 +351,29 @@ declare <vscale x 32 x i32> @llvm.vp.select.nxv32i32(<vscale x 32 x i1>, <vscale
 define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32 x i32> %b, <vscale x 32 x i32> %c, i32 zeroext %evl) {
 ; CHECK-LABEL: select_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a4, a3, 3
 ; CHECK-NEXT:    slli a1, a3, 1
 ; CHECK-NEXT:    srli a3, a3, 2
 ; CHECK-NEXT:    add a4, a0, a4
-; CHECK-NEXT:    sub a5, a2, a1
-; CHECK-NEXT:    vl8re32.v v8, (a4)
-; CHECK-NEXT:    sltu a4, a2, a5
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
+; CHECK-NEXT:    sub a3, a2, a1
+; CHECK-NEXT:    vl8re32.v v24, (a4)
+; CHECK-NEXT:    sltu a4, a2, a3
 ; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    vl8re32.v v0, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vslidedown.vx v0, v24, a3
-; CHECK-NEXT:    and a4, a4, a5
-; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    vl8re32.v v24, (a0)
 ; CHECK-NEXT:    bltu a2, a1, .LBB27_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB27_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x i32> @llvm.vp.select.nxv32i32(<vscale x 32 x i1> %a, <vscale x 32 x i32> %b, <vscale x 32 x i32> %c, i32 %evl)
   ret <vscale x 32 x i32> %v
@@ -410,55 +384,29 @@ declare i32 @llvm.vscale.i32()
 define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32 x i32> %b, <vscale x 32 x i32> %c) {
 ; CHECK-LABEL: select_evl_nxv32i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
 ; CHECK-NEXT:    slli a2, a1, 1
 ; CHECK-NEXT:    srli a4, a1, 2
 ; CHECK-NEXT:    add a3, a0, a3
-; CHECK-NEXT:    sub a5, a1, a2
-; CHECK-NEXT:    vl8re32.v v8, (a3)
-; CHECK-NEXT:    sltu a3, a1, a5
+; CHECK-NEXT:    vslidedown.vx v0, v0, a4
+; CHECK-NEXT:    sub a4, a1, a2
+; CHECK-NEXT:    vl8re32.v v24, (a3)
+; CHECK-NEXT:    sltu a3, a1, a4
 ; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    vl8re32.v v0, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vslidedown.vx v0, v24, a4
-; CHECK-NEXT:    and a3, a3, a5
+; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    vl8re32.v v24, (a0)
 ; CHECK-NEXT:    bltu a1, a2, .LBB28_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB28_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %evl = call i32 @llvm.vscale.i32()
   %evl0 = mul i32 %evl, 8
@@ -699,54 +647,28 @@ declare <vscale x 16 x double> @llvm.vp.select.nxv16f64(<vscale x 16 x i1>, <vsc
 define <vscale x 16 x double> @select_nxv16f64(<vscale x 16 x i1> %a, <vscale x 16 x double> %b, <vscale x 16 x double> %c, i32 zeroext %evl) {
 ; CHECK-LABEL: select_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
+; CHECK-NEXT:    srli a4, a1, 3
+; CHECK-NEXT:    vslidedown.vx v0, v0, a4
 ; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
-; CHECK-NEXT:    sltu a5, a2, a4
-; CHECK-NEXT:    vl8re64.v v8, (a3)
-; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    srli a3, a1, 3
-; CHECK-NEXT:    vl8re64.v v0, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vslidedown.vx v0, v24, a3
-; CHECK-NEXT:    and a4, a5, a4
-; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT:    vl8re64.v v24, (a3)
+; CHECK-NEXT:    sltu a3, a2, a4
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a4
+; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    vl8re64.v v24, (a0)
 ; CHECK-NEXT:    bltu a2, a1, .LBB48_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB48_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.select.nxv16f64(<vscale x 16 x i1> %a, <vscale x 16 x double> %b, <vscale x 16 x double> %c, i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
index 5b577dc0f8df9..f359fbfc63632 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
@@ -126,10 +126,10 @@ define <vscale x 1 x double> @test4(i64 %avl, i8 zeroext %cond, <vscale x 1 x do
 ; CHECK-NEXT:    lui a1, %hi(.LCPI3_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI3_0)(a1)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI3_1)
-; CHECK-NEXT:    fld fa4, %lo(.LCPI3_1)(a1)
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v10, fa5
-; CHECK-NEXT:    vfmv.v.f v11, fa4
+; CHECK-NEXT:    fld fa5, %lo(.LCPI3_1)(a1)
+; CHECK-NEXT:    vfmv.v.f v11, fa5
 ; CHECK-NEXT:    vfadd.vv v10, v10, v11
 ; CHECK-NEXT:    lui a1, %hi(scratch)
 ; CHECK-NEXT:    addi a1, a1, %lo(scratch)
@@ -177,9 +177,9 @@ if.end:                                           ; preds = %if.else, %if.then
 define <vscale x 1 x double> @test5(i64 %avl, i8 zeroext %cond, <vscale x 1 x double> %a, <vscale x 1 x double> %b) nounwind {
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    andi a2, a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    bnez a2, .LBB4_3
+; CHECK-NEXT:    andi a0, a1, 1
+; CHECK-NEXT:    bnez a0, .LBB4_3
 ; CHECK-NEXT:  # %bb.1: # %if.else
 ; CHECK-NEXT:    vfsub.vv v9, v8, v9
 ; CHECK-NEXT:    andi a1, a1, 2
@@ -234,8 +234,8 @@ if.end6:                                          ; preds = %if.else5, %if.then4
 define <vscale x 1 x double> @test6(i64 %avl, i8 zeroext %cond, <vscale x 1 x double> %a, <vscale x 1 x double> %b) nounwind {
 ; CHECK-LABEL: test6:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    andi a2, a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    andi a2, a1, 1
 ; CHECK-NEXT:    bnez a2, .LBB5_3
 ; CHECK-NEXT:  # %bb.1: # %if.else
 ; CHECK-NEXT:    vfsub.vv v8, v8, v9
@@ -245,9 +245,9 @@ define <vscale x 1 x double> @test6(i64 %avl, i8 zeroext %cond, <vscale x 1 x do
 ; CHECK-NEXT:    lui a1, %hi(.LCPI5_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI5_0)(a1)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI5_1)
-; CHECK-NEXT:    fld fa4, %lo(.LCPI5_1)(a1)
 ; CHECK-NEXT:    vfmv.v.f v9, fa5
-; CHECK-NEXT:    vfmv.v.f v10, fa4
+; CHECK-NEXT:    fld fa5, %lo(.LCPI5_1)(a1)
+; CHECK-NEXT:    vfmv.v.f v10, fa5
 ; CHECK-NEXT:    vfadd.vv v9, v9, v10
 ; CHECK-NEXT:    lui a1, %hi(scratch)
 ; CHECK-NEXT:    addi a1, a1, %lo(scratch)
@@ -446,15 +446,15 @@ define void @saxpy_vec(i64 %n, float %a, ptr nocapture readonly %x, ptr nocaptur
 ; CHECK-NEXT:  .LBB8_1: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a1)
-; CHECK-NEXT:    vle32.v v16, (a2)
 ; CHECK-NEXT:    slli a4, a3, 2
+; CHECK-NEXT:    vle32.v v16, (a2)
 ; CHECK-NEXT:    sub a0, a0, a3
 ; CHECK-NEXT:    add a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, tu, ma
 ; CHECK-NEXT:    vfmacc.vf v16, fa0, v8
 ; CHECK-NEXT:    vse32.v v16, (a2)
-; CHECK-NEXT:    vsetvli a3, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    vsetvli a3, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    bnez a3, .LBB8_1
 ; CHECK-NEXT:  .LBB8_2: # %for.end
 ; CHECK-NEXT:    ret
@@ -494,15 +494,15 @@ define void @saxpy_vec_demanded_fields(i64 %n, float %a, ptr nocapture readonly
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a1)
-; CHECK-NEXT:    vle32.v v16, (a2)
 ; CHECK-NEXT:    slli a4, a3, 2
+; CHECK-NEXT:    vle32.v v16, (a2)
 ; CHECK-NEXT:    sub a0, a0, a3
 ; CHECK-NEXT:    add a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, tu, ma
 ; CHECK-NEXT:    vfmacc.vf v16, fa0, v8
 ; CHECK-NEXT:    vse32.v v16, (a2)
-; CHECK-NEXT:    vsetvli a3, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    vsetvli a3, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    bnez a3, .LBB9_1
 ; CHECK-NEXT:  .LBB9_2: # %for.end
 ; CHECK-NEXT:    ret
@@ -544,9 +544,9 @@ declare void @llvm.riscv.vse.nxv16f32.i64(<vscale x 16 x float>, ptr nocapture,
 define <vscale x 2 x i32> @test_vsetvli_x0_x0(ptr %x, ptr %y, <vscale x 2 x i32> %z, i64 %vl, i1 %cond) nounwind {
 ; CHECK-LABEL: test_vsetvli_x0_x0:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    andi a3, a3, 1
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    andi a3, a3, 1
 ; CHECK-NEXT:    beqz a3, .LBB10_2
 ; CHECK-NEXT:  # %bb.1: # %if
 ; CHECK-NEXT:    vle16.v v10, (a1)
@@ -583,9 +583,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vadd.nxv2i32(<vscale x 2 x i32>, <vscale
 define <vscale x 2 x i32> @test_vsetvli_x0_x0_2(ptr %x, ptr %y, ptr %z, i64 %vl, i1 %cond, i1 %cond2, <vscale x 2 x i32> %w) nounwind {
 ; CHECK-LABEL: test_vsetvli_x0_x0_2:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    andi a4, a4, 1
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    andi a4, a4, 1
 ; CHECK-NEXT:    beqz a4, .LBB11_2
 ; CHECK-NEXT:  # %bb.1: # %if
 ; CHECK-NEXT:    vle16.v v10, (a1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
index 8b48dc43eca29..fd690bb31f716 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
@@ -109,13 +109,13 @@ define void @test6(ptr nocapture readonly %A, ptr nocapture %B, i64 %n) {
 ; CHECK-NEXT:  .LBB5_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    slli a4, a3, 2
+; CHECK-NEXT:    add a3, a3, a2
 ; CHECK-NEXT:    add a5, a0, a4
+; CHECK-NEXT:    add a4, a4, a1
 ; CHECK-NEXT:    vle32.v v8, (a5)
-; CHECK-NEXT:    add a3, a3, a2
 ; CHECK-NEXT:    vmsle.vi v9, v8, -3
 ; CHECK-NEXT:    vmsgt.vi v10, v8, 2
 ; CHECK-NEXT:    vmor.mm v0, v9, v10
-; CHECK-NEXT:    add a4, a4, a1
 ; CHECK-NEXT:    vse32.v v8, (a4), v0.t
 ; CHECK-NEXT:    vsetvli a2, a2, e32, m1, ta, ma
 ; CHECK-NEXT:    bnez a2, .LBB5_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll
index c3b19b59ec3d6..f658a2c6b24a6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll
@@ -11,10 +11,9 @@ define i32 @illegal_preserve_vl(<vscale x 2 x i32> %a, <vscale x 4 x i64> %x, pt
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vadd.vv v12, v12, v12
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.x.s a1, v8
 ; CHECK-NEXT:    vs4r.v v12, (a0)
-; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
   %index = add <vscale x 4 x i64> %x, %x
   store <vscale x 4 x i64> %index, ptr %y
diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
index fd5bf4ebcede8..de12e23345f08 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
@@ -290,69 +290,68 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a3, a1, 3
-; CHECK-NEXT:    srli a5, a1, 2
-; CHECK-NEXT:    slli a6, a1, 3
-; CHECK-NEXT:    slli a4, a1, 1
-; CHECK-NEXT:    vslidedown.vx v16, v0, a5
-; CHECK-NEXT:    add a6, a0, a6
-; CHECK-NEXT:    sub a5, a2, a4
-; CHECK-NEXT:    vl8re64.v v24, (a6)
-; CHECK-NEXT:    sltu a6, a2, a5
+; CHECK-NEXT:    srli a4, a1, 2
+; CHECK-NEXT:    slli a5, a1, 3
+; CHECK-NEXT:    slli a3, a1, 1
+; CHECK-NEXT:    vslidedown.vx v16, v0, a4
+; CHECK-NEXT:    add a5, a0, a5
+; CHECK-NEXT:    sub a4, a2, a3
+; CHECK-NEXT:    vl8re64.v v24, (a5)
+; CHECK-NEXT:    sltu a5, a2, a4
+; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    and a4, a5, a4
+; CHECK-NEXT:    sub a5, a4, a1
+; CHECK-NEXT:    sltu a6, a4, a5
 ; CHECK-NEXT:    addi a6, a6, -1
-; CHECK-NEXT:    and a5, a6, a5
-; CHECK-NEXT:    sub a6, a5, a1
-; CHECK-NEXT:    sltu a7, a5, a6
-; CHECK-NEXT:    addi a7, a7, -1
-; CHECK-NEXT:    vl8re64.v v8, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v16, a3
-; CHECK-NEXT:    and a0, a7, a6
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    and a6, a6, a5
+; CHECK-NEXT:    srli a5, a1, 3
+; CHECK-NEXT:    vsetvli a7, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v16, a5
+; CHECK-NEXT:    vsetvli zero, a6, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v20, v24, 0, v0.t
-; CHECK-NEXT:    bltu a5, a1, .LBB17_2
+; CHECK-NEXT:    vl8re64.v v24, (a0)
+; CHECK-NEXT:    bltu a4, a1, .LBB17_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a5, a1
+; CHECK-NEXT:    mv a4, a1
 ; CHECK-NEXT:  .LBB17_2:
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v6, v7, a3
-; CHECK-NEXT:    vsetvli zero, a5, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    bltu a2, a4, .LBB17_4
+; CHECK-NEXT:    vslidedown.vx v6, v7, a5
+; CHECK-NEXT:    vsetvli zero, a4, e32, m4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v16, v24, 0, v0.t
+; CHECK-NEXT:    bltu a2, a3, .LBB17_4
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    mv a2, a4
+; CHECK-NEXT:    mv a2, a3
 ; CHECK-NEXT:  .LBB17_4:
 ; CHECK-NEXT:    sub a0, a2, a1
 ; CHECK-NEXT:    sltu a3, a2, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a0, a3, a0
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v28, v8, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v12, v24, 0, v0.t
 ; CHECK-NEXT:    bltu a2, a1, .LBB17_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB17_6:
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v24, v8, 0, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vnsrl.wi v8, v24, 0, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll
index 02af09f028fc1..8337edcaffb35 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll
@@ -41,8 +41,8 @@ define <vscale x 8 x i64> @vwaddu_vv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmslt.vx v0, v8, a0
 ; CHECK-NEXT:    vmv.v.i v16, 0
+; CHECK-NEXT:    vmslt.vx v0, v8, a0
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vwaddu.vv v16, v8, v12
 ; CHECK-NEXT:    vmv8r.v v8, v16
@@ -77,8 +77,8 @@ define <vscale x 8 x i64> @vwadd_wv_mask_v8i32_nonzero(<vscale x 8 x i32> %x, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmslt.vx v0, v8, a0
 ; CHECK-NEXT:    vmv.v.i v12, 1
+; CHECK-NEXT:    vmslt.vx v0, v8, a0
 ; CHECK-NEXT:    vmerge.vvm v24, v12, v8, v0
 ; CHECK-NEXT:    vwadd.wv v8, v16, v24
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll
index ddc27f7562cdb..336d86d57f3e6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll
@@ -1377,9 +1377,9 @@ define <vscale x 1 x i64> @i1_zext(<vscale x 1 x i1> %va, <vscale x 1 x i64> %vb
 ;
 ; RV64-LABEL: i1_zext:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a1, 42
-; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
 ; RV64-NEXT:    vadd.vi v8, v8, 1, v0.t
+; RV64-NEXT:    li a1, 42
 ; RV64-NEXT:    sh a1, 0(a0)
 ; RV64-NEXT:    ret
   %vc = zext <vscale x 1 x i1> %va to <vscale x 1 x i64>
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll
index 04ece9d94880c..dcbb1a88d3731 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll
@@ -41,8 +41,8 @@ define <vscale x 8 x i64> @vwsubu_vv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmslt.vx v0, v8, a0
 ; CHECK-NEXT:    vmv.v.i v16, 0
+; CHECK-NEXT:    vmslt.vx v0, v8, a0
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vwsubu.vv v16, v12, v8
 ; CHECK-NEXT:    vmv8r.v v8, v16
@@ -60,8 +60,8 @@ define <vscale x 8 x i64> @vwsub_wv_mask_v8i32_nonzero(<vscale x 8 x i32> %x, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 42
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmslt.vx v0, v8, a0
 ; CHECK-NEXT:    vmv.v.i v12, 1
+; CHECK-NEXT:    vmslt.vx v0, v8, a0
 ; CHECK-NEXT:    vmerge.vvm v24, v12, v8, v0
 ; CHECK-NEXT:    vwsub.wv v8, v16, v24
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
index c35f05be304cc..5e2e316b16ddd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
@@ -34,10 +34,8 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset s0, -4
 ; RV32-NEXT:    .cfi_offset s1, -8
-; RV32-NEXT:    .cfi_offset s2, -12
 ; RV32-NEXT:    add t3, a0, t3
 ; RV32-NEXT:    add t4, a2, t4
 ; RV32-NEXT:    add s0, a4, t5
@@ -75,8 +73,8 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
 ; RV32-NEXT:    # in Loop: Header=BB0_10 Depth=1
 ; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    addi t0, t0, 1
 ; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    addi t0, t0, 1
 ; RV32-NEXT:    beq t0, a7, .LBB0_16
 ; RV32-NEXT:  .LBB0_10: # %for.cond1.preheader.us
 ; RV32-NEXT:    # =>This Loop Header: Depth=1
@@ -102,17 +100,17 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
 ; RV32-NEXT:    add s0, a2, t6
 ; RV32-NEXT:    add s1, a4, t6
 ; RV32-NEXT:    vl2r.v v8, (s0)
-; RV32-NEXT:    add s0, a0, t6
+; RV32-NEXT:    add s0, t6, t2
 ; RV32-NEXT:    vl2r.v v10, (s1)
-; RV32-NEXT:    add s1, t6, t2
-; RV32-NEXT:    sltu t6, s1, t6
-; RV32-NEXT:    add t5, t5, t6
-; RV32-NEXT:    xor t6, s1, t4
+; RV32-NEXT:    sltu s1, s0, t6
+; RV32-NEXT:    add t5, t5, s1
+; RV32-NEXT:    add t6, a0, t6
 ; RV32-NEXT:    vaaddu.vv v8, v8, v10
-; RV32-NEXT:    or s2, t6, t5
-; RV32-NEXT:    vs2r.v v8, (s0)
-; RV32-NEXT:    mv t6, s1
-; RV32-NEXT:    bnez s2, .LBB0_13
+; RV32-NEXT:    vs2r.v v8, (t6)
+; RV32-NEXT:    xor t6, s0, t4
+; RV32-NEXT:    or s1, t6, t5
+; RV32-NEXT:    mv t6, s0
+; RV32-NEXT:    bnez s1, .LBB0_13
 ; RV32-NEXT:  # %bb.14: # %middle.block
 ; RV32-NEXT:    # in Loop: Header=BB0_10 Depth=1
 ; RV32-NEXT:    beq t4, a6, .LBB0_9
@@ -121,27 +119,25 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
 ; RV32-NEXT:    # => This Inner Loop Header: Depth=2
 ; RV32-NEXT:    add t5, a2, t4
 ; RV32-NEXT:    add t6, a4, t4
-; RV32-NEXT:    add s0, a0, t4
 ; RV32-NEXT:    lbu t5, 0(t5)
 ; RV32-NEXT:    lbu t6, 0(t6)
-; RV32-NEXT:    addi t4, t4, 1
-; RV32-NEXT:    seqz s1, t4
-; RV32-NEXT:    add t3, t3, s1
 ; RV32-NEXT:    add t5, t5, t6
-; RV32-NEXT:    xor t6, t4, a6
+; RV32-NEXT:    add t6, a0, t4
+; RV32-NEXT:    addi t4, t4, 1
 ; RV32-NEXT:    addi t5, t5, 1
 ; RV32-NEXT:    srli t5, t5, 1
-; RV32-NEXT:    or t6, t6, t3
-; RV32-NEXT:    sb t5, 0(s0)
-; RV32-NEXT:    bnez t6, .LBB0_15
+; RV32-NEXT:    sb t5, 0(t6)
+; RV32-NEXT:    seqz t5, t4
+; RV32-NEXT:    xor t6, t4, a6
+; RV32-NEXT:    add t3, t3, t5
+; RV32-NEXT:    or t5, t6, t3
+; RV32-NEXT:    bnez t5, .LBB0_15
 ; RV32-NEXT:    j .LBB0_9
 ; RV32-NEXT:  .LBB0_16:
 ; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore s0
 ; RV32-NEXT:    .cfi_restore s1
-; RV32-NEXT:    .cfi_restore s2
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:  .LBB0_17: # %for.cond.cleanup
@@ -436,16 +432,16 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
 ; RV64-NEXT:    add s0, a2, a6
 ; RV64-NEXT:    add t6, a4, a6
 ; RV64-NEXT:    csrr t0, vlenb
-; RV64-NEXT:    li t2, 32
-; RV64-NEXT:    slli t1, t1, 32
-; RV64-NEXT:    srli t3, t1, 32
-; RV64-NEXT:    mul t1, a1, t3
-; RV64-NEXT:    add t5, t5, t1
-; RV64-NEXT:    mul t1, a3, t3
-; RV64-NEXT:    add s0, s0, t1
+; RV64-NEXT:    slli t2, t1, 32
 ; RV64-NEXT:    slli t1, t0, 1
-; RV64-NEXT:    mul t3, a5, t3
-; RV64-NEXT:    add t6, t6, t3
+; RV64-NEXT:    srli t2, t2, 32
+; RV64-NEXT:    mul t3, a1, t2
+; RV64-NEXT:    add t5, t5, t3
+; RV64-NEXT:    mul t3, a3, t2
+; RV64-NEXT:    mul t2, a5, t2
+; RV64-NEXT:    add s0, s0, t3
+; RV64-NEXT:    add t6, t6, t2
+; RV64-NEXT:    li t2, 32
 ; RV64-NEXT:    mv t4, t1
 ; RV64-NEXT:    bltu t2, t1, .LBB0_4
 ; RV64-NEXT:  # %bb.3: # %for.cond1.preheader.us.preheader
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll
index 72f25268109a1..ce344bd7553fe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll
@@ -393,8 +393,8 @@ define void @test10(ptr nocapture %ptr_dest, ptr nocapture readonly %ptr_op1, pt
 ; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vle8.v v9, (a2)
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
-; CHECK-NEXT:    sub a3, a3, a4
 ; CHECK-NEXT:    vse8.v v8, (a0)
+; CHECK-NEXT:    sub a3, a3, a4
 ; CHECK-NEXT:    bnez a3, .LBB9_2
 ; CHECK-NEXT:  .LBB9_3: # %for.end
 ; CHECK-NEXT:    ret
@@ -432,8 +432,8 @@ define void @test11(ptr nocapture %ptr_dest, ptr nocapture readonly %ptr_op1, pt
 ; CHECK-NEXT:  .LBB10_1: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vaadd.vv v8, v8, v9
-; CHECK-NEXT:    sub a3, a3, a4
 ; CHECK-NEXT:    vse8.v v8, (a0)
+; CHECK-NEXT:    sub a3, a3, a4
 ; CHECK-NEXT:    beqz a3, .LBB10_3
 ; CHECK-NEXT:  # %bb.2: # %for.body
 ; CHECK-NEXT:    # in Loop: Header=BB10_1 Depth=1
diff --git a/llvm/test/CodeGen/RISCV/rvv/wrong-chain-fixed-load.ll b/llvm/test/CodeGen/RISCV/rvv/wrong-chain-fixed-load.ll
index 5872a0995feba..f94c5635032a4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/wrong-chain-fixed-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/wrong-chain-fixed-load.ll
@@ -8,10 +8,10 @@ define void @do.memmove() nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lui a0, %hi(c)
 ; CHECK-NEXT:    addi a0, a0, %lo(c)
-; CHECK-NEXT:    addi a1, a0, 16
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    addi a1, a0, 24
+; CHECK-NEXT:    addi a2, a0, 16
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a2)
 ; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 8
diff --git a/llvm/test/CodeGen/RISCV/scmp.ll b/llvm/test/CodeGen/RISCV/scmp.ll
index a212714db53e0..8a0baa67d0293 100644
--- a/llvm/test/CodeGen/RISCV/scmp.ll
+++ b/llvm/test/CodeGen/RISCV/scmp.ll
@@ -89,15 +89,15 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind {
 ; RV32I-NEXT:    lw a2, 4(a1)
 ; RV32I-NEXT:    lw a4, 8(a1)
 ; RV32I-NEXT:    lw a5, 12(a1)
-; RV32I-NEXT:    lw a6, 12(a0)
 ; RV32I-NEXT:    lw a3, 4(a0)
-; RV32I-NEXT:    lw a7, 8(a0)
-; RV32I-NEXT:    beq a6, a5, .LBB4_2
+; RV32I-NEXT:    lw a6, 8(a0)
+; RV32I-NEXT:    lw a7, 12(a0)
+; RV32I-NEXT:    beq a7, a5, .LBB4_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slt t2, a6, a5
+; RV32I-NEXT:    slt t2, a7, a5
 ; RV32I-NEXT:    j .LBB4_3
 ; RV32I-NEXT:  .LBB4_2:
-; RV32I-NEXT:    sltu t2, a7, a4
+; RV32I-NEXT:    sltu t2, a6, a4
 ; RV32I-NEXT:  .LBB4_3:
 ; RV32I-NEXT:    lw a1, 0(a1)
 ; RV32I-NEXT:    lw t0, 0(a0)
@@ -108,23 +108,23 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind {
 ; RV32I-NEXT:  .LBB4_5:
 ; RV32I-NEXT:    sltu a0, t0, a1
 ; RV32I-NEXT:  .LBB4_6:
-; RV32I-NEXT:    xor t1, a6, a5
-; RV32I-NEXT:    xor t3, a7, a4
+; RV32I-NEXT:    xor t1, a7, a5
+; RV32I-NEXT:    xor t3, a6, a4
 ; RV32I-NEXT:    or t1, t3, t1
 ; RV32I-NEXT:    beqz t1, .LBB4_8
 ; RV32I-NEXT:  # %bb.7:
 ; RV32I-NEXT:    mv a0, t2
 ; RV32I-NEXT:  .LBB4_8:
-; RV32I-NEXT:    beq a6, a5, .LBB4_11
+; RV32I-NEXT:    beq a7, a5, .LBB4_11
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    slt a4, a5, a6
+; RV32I-NEXT:    slt a4, a5, a7
 ; RV32I-NEXT:    bne a3, a2, .LBB4_12
 ; RV32I-NEXT:  .LBB4_10:
 ; RV32I-NEXT:    sltu a1, a1, t0
 ; RV32I-NEXT:    bnez t1, .LBB4_13
 ; RV32I-NEXT:    j .LBB4_14
 ; RV32I-NEXT:  .LBB4_11:
-; RV32I-NEXT:    sltu a4, a4, a7
+; RV32I-NEXT:    sltu a4, a4, a6
 ; RV32I-NEXT:    beq a3, a2, .LBB4_10
 ; RV32I-NEXT:  .LBB4_12:
 ; RV32I-NEXT:    sltu a1, a2, a3
diff --git a/llvm/test/CodeGen/RISCV/select-and.ll b/llvm/test/CodeGen/RISCV/select-and.ll
index f827e840f4a36..01965a2da23f8 100644
--- a/llvm/test/CodeGen/RISCV/select-and.ll
+++ b/llvm/test/CodeGen/RISCV/select-and.ll
@@ -12,22 +12,22 @@
 define signext i32 @select_of_and(i1 zeroext %a, i1 zeroext %b, i32 signext %c, i32 signext %d) nounwind {
 ; RV32I-LABEL: select_of_and:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    and a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a1, .LBB0_2
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    bnez a0, .LBB0_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a3
+; RV32I-NEXT:    mv a2, a3
 ; RV32I-NEXT:  .LBB0_2:
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: select_of_and:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    and a1, a0, a1
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:    bnez a1, .LBB0_2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    bnez a0, .LBB0_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    mv a0, a3
+; RV64I-NEXT:    mv a2, a3
 ; RV64I-NEXT:  .LBB0_2:
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV64I-CCMOV-LABEL: select_of_and:
diff --git a/llvm/test/CodeGen/RISCV/select-bare.ll b/llvm/test/CodeGen/RISCV/select-bare.ll
index c9e108a1ca9d0..ab03b1a684730 100644
--- a/llvm/test/CodeGen/RISCV/select-bare.ll
+++ b/llvm/test/CodeGen/RISCV/select-bare.ll
@@ -7,12 +7,12 @@
 define i32 @bare_select(i1 %a, i32 %b, i32 %c) nounwind {
 ; RV32I-LABEL: bare_select:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    andi a3, a0, 1
-; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    bnez a3, .LBB0_2
+; RV32I-NEXT:    andi a0, a0, 1
+; RV32I-NEXT:    bnez a0, .LBB0_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a2
 ; RV32I-NEXT:  .LBB0_2:
+; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-CCMOV-LABEL: bare_select:
@@ -27,12 +27,12 @@ define i32 @bare_select(i1 %a, i32 %b, i32 %c) nounwind {
 define float @bare_select_float(i1 %a, float %b, float %c) nounwind {
 ; RV32I-LABEL: bare_select_float:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    andi a3, a0, 1
-; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    bnez a3, .LBB1_2
+; RV32I-NEXT:    andi a0, a0, 1
+; RV32I-NEXT:    bnez a0, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    mv a1, a2
 ; RV32I-NEXT:  .LBB1_2:
+; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-CCMOV-LABEL: bare_select_float:
diff --git a/llvm/test/CodeGen/RISCV/select-cc.ll b/llvm/test/CodeGen/RISCV/select-cc.ll
index 1c2a0cf007d11..568fea4df4acc 100644
--- a/llvm/test/CodeGen/RISCV/select-cc.ll
+++ b/llvm/test/CodeGen/RISCV/select-cc.ll
@@ -163,48 +163,48 @@ define signext i32 @foo(i32 signext %a, ptr %b) nounwind {
 ; RV64I-CCMOV:       # %bb.0:
 ; RV64I-CCMOV-NEXT:    lw a2, 0(a1)
 ; RV64I-CCMOV-NEXT:    lw a3, 0(a1)
-; RV64I-CCMOV-NEXT:    lw a4, 0(a1)
-; RV64I-CCMOV-NEXT:    lw a5, 0(a1)
-; RV64I-CCMOV-NEXT:    xor a6, a0, a2
-; RV64I-CCMOV-NEXT:    mips.ccmov a0, a6, a2, a0
-; RV64I-CCMOV-NEXT:    xor a2, a0, a3
-; RV64I-CCMOV-NEXT:    mips.ccmov a0, a2, a0, a3
+; RV64I-CCMOV-NEXT:    xor a4, a0, a2
+; RV64I-CCMOV-NEXT:    mips.ccmov a0, a4, a2, a0
 ; RV64I-CCMOV-NEXT:    lw a2, 0(a1)
-; RV64I-CCMOV-NEXT:    sltu a3, a4, a0
-; RV64I-CCMOV-NEXT:    mips.ccmov a0, a3, a0, a4
+; RV64I-CCMOV-NEXT:    xor a4, a0, a3
+; RV64I-CCMOV-NEXT:    mips.ccmov a0, a4, a0, a3
 ; RV64I-CCMOV-NEXT:    lw a3, 0(a1)
-; RV64I-CCMOV-NEXT:    sltu a4, a0, a5
-; RV64I-CCMOV-NEXT:    mips.ccmov a0, a4, a5, a0
-; RV64I-CCMOV-NEXT:    lw a4, 0(a1)
-; RV64I-CCMOV-NEXT:    sltu a5, a0, a2
-; RV64I-CCMOV-NEXT:    mips.ccmov a0, a5, a0, a2
+; RV64I-CCMOV-NEXT:    sltu a4, a2, a0
+; RV64I-CCMOV-NEXT:    mips.ccmov a0, a4, a0, a2
 ; RV64I-CCMOV-NEXT:    lw a2, 0(a1)
-; RV64I-CCMOV-NEXT:    sltu a5, a3, a0
-; RV64I-CCMOV-NEXT:    mips.ccmov a0, a5, a3, a0
+; RV64I-CCMOV-NEXT:    sltu a4, a0, a3
+; RV64I-CCMOV-NEXT:    mips.ccmov a0, a4, a3, a0
 ; RV64I-CCMOV-NEXT:    lw a3, 0(a1)
-; RV64I-CCMOV-NEXT:    sext.w a5, a0
-; RV64I-CCMOV-NEXT:    slt a5, a4, a5
-; RV64I-CCMOV-NEXT:    mips.ccmov a0, a5, a0, a4
-; RV64I-CCMOV-NEXT:    lw a4, 0(a1)
-; RV64I-CCMOV-NEXT:    sext.w a5, a0
-; RV64I-CCMOV-NEXT:    slt a5, a5, a2
-; RV64I-CCMOV-NEXT:    mips.ccmov a0, a5, a2, a0
+; RV64I-CCMOV-NEXT:    sltu a4, a0, a2
+; RV64I-CCMOV-NEXT:    mips.ccmov a0, a4, a0, a2
 ; RV64I-CCMOV-NEXT:    lw a2, 0(a1)
-; RV64I-CCMOV-NEXT:    sext.w a5, a0
-; RV64I-CCMOV-NEXT:    slt a5, a5, a3
-; RV64I-CCMOV-NEXT:    mips.ccmov a0, a5, a0, a3
+; RV64I-CCMOV-NEXT:    sltu a4, a3, a0
+; RV64I-CCMOV-NEXT:    mips.ccmov a0, a4, a3, a0
+; RV64I-CCMOV-NEXT:    lw a3, 0(a1)
+; RV64I-CCMOV-NEXT:    sext.w a4, a0
+; RV64I-CCMOV-NEXT:    slt a4, a2, a4
+; RV64I-CCMOV-NEXT:    mips.ccmov a0, a4, a0, a2
+; RV64I-CCMOV-NEXT:    lw a2, 0(a1)
+; RV64I-CCMOV-NEXT:    sext.w a4, a0
+; RV64I-CCMOV-NEXT:    slt a4, a4, a3
+; RV64I-CCMOV-NEXT:    mips.ccmov a0, a4, a3, a0
+; RV64I-CCMOV-NEXT:    lw a3, 0(a1)
+; RV64I-CCMOV-NEXT:    sext.w a4, a0
+; RV64I-CCMOV-NEXT:    slt a4, a4, a2
+; RV64I-CCMOV-NEXT:    mips.ccmov a0, a4, a0, a2
+; RV64I-CCMOV-NEXT:    lw a2, 0(a1)
+; RV64I-CCMOV-NEXT:    sext.w a4, a0
+; RV64I-CCMOV-NEXT:    slt a4, a3, a4
+; RV64I-CCMOV-NEXT:    mips.ccmov a0, a4, a3, a0
+; RV64I-CCMOV-NEXT:    lw a3, 0(a1)
+; RV64I-CCMOV-NEXT:    slti a4, a2, 1
+; RV64I-CCMOV-NEXT:    mips.ccmov a0, a4, a0, a2
+; RV64I-CCMOV-NEXT:    slti a4, a2, 0
+; RV64I-CCMOV-NEXT:    mips.ccmov a0, a4, a3, a0
 ; RV64I-CCMOV-NEXT:    lw a3, 0(a1)
-; RV64I-CCMOV-NEXT:    sext.w a5, a0
-; RV64I-CCMOV-NEXT:    slt a5, a4, a5
-; RV64I-CCMOV-NEXT:    mips.ccmov a0, a5, a4, a0
-; RV64I-CCMOV-NEXT:    lw a4, 0(a1)
-; RV64I-CCMOV-NEXT:    slti a5, a2, 1
-; RV64I-CCMOV-NEXT:    mips.ccmov a0, a5, a0, a2
-; RV64I-CCMOV-NEXT:    slti a5, a2, 0
-; RV64I-CCMOV-NEXT:    mips.ccmov a0, a5, a3, a0
 ; RV64I-CCMOV-NEXT:    lw a1, 0(a1)
-; RV64I-CCMOV-NEXT:    slti a3, a4, 1025
-; RV64I-CCMOV-NEXT:    mips.ccmov a0, a3, a4, a0
+; RV64I-CCMOV-NEXT:    slti a4, a3, 1025
+; RV64I-CCMOV-NEXT:    mips.ccmov a0, a4, a3, a0
 ; RV64I-CCMOV-NEXT:    sltiu a2, a2, 2047
 ; RV64I-CCMOV-NEXT:    mips.ccmov a0, a2, a1, a0
 ; RV64I-CCMOV-NEXT:    sext.w a0, a0
diff --git a/llvm/test/CodeGen/RISCV/select-constant-xor.ll b/llvm/test/CodeGen/RISCV/select-constant-xor.ll
index 2e26ae78e2dd8..254ff96ef5648 100644
--- a/llvm/test/CodeGen/RISCV/select-constant-xor.ll
+++ b/llvm/test/CodeGen/RISCV/select-constant-xor.ll
@@ -172,12 +172,12 @@ define i32 @icmpasreq(i32 %input, i32 %a, i32 %b) {
 ;
 ; RV64-LABEL: icmpasreq:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    sext.w a3, a0
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bltz a3, .LBB8_2
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    bltz a0, .LBB8_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB8_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %sh = ashr i32 %input, 31
   %c = icmp eq i32 %sh, -1
@@ -197,12 +197,12 @@ define i32 @icmpasrne(i32 %input, i32 %a, i32 %b) {
 ;
 ; RV64-LABEL: icmpasrne:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    sext.w a3, a0
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    bgez a3, .LBB9_2
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    bgez a0, .LBB9_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB9_2:
+; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %sh = ashr i32 %input, 31
   %c = icmp ne i32 %sh, -1
diff --git a/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll b/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll
index 005a01bf1000a..3020e61fd6985 100644
--- a/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll
+++ b/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll
@@ -96,24 +96,24 @@ entry:
 define i64 @cmov64(i1 %a, i64 %b, i64 %c) nounwind {
 ; RV32I-LABEL: cmov64:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a5, a0, 1
-; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    bnez a5, .LBB2_2
+; RV32I-NEXT:    andi a0, a0, 1
+; RV32I-NEXT:    bnez a0, .LBB2_2
 ; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    mv a0, a3
+; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    mv a2, a4
 ; RV32I-NEXT:  .LBB2_2: # %entry
+; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    mv a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: cmov64:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    andi a3, a0, 1
-; RV64I-NEXT:    mv a0, a1
-; RV64I-NEXT:    bnez a3, .LBB2_2
+; RV64I-NEXT:    andi a0, a0, 1
+; RV64I-NEXT:    bnez a0, .LBB2_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a2
+; RV64I-NEXT:    mv a1, a2
 ; RV64I-NEXT:  .LBB2_2: # %entry
+; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ret
 entry:
   %cond = select i1 %a, i64 %b, i64 %c
@@ -161,13 +161,13 @@ define i128 @cmov128(i1 %a, i128 %b, i128 %c) nounwind {
 ;
 ; RV64I-LABEL: cmov128:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    andi a5, a0, 1
-; RV64I-NEXT:    mv a0, a1
-; RV64I-NEXT:    bnez a5, .LBB3_2
+; RV64I-NEXT:    andi a0, a0, 1
+; RV64I-NEXT:    bnez a0, .LBB3_2
 ; RV64I-NEXT:  # %bb.1: # %entry
-; RV64I-NEXT:    mv a0, a3
+; RV64I-NEXT:    mv a1, a3
 ; RV64I-NEXT:    mv a2, a4
 ; RV64I-NEXT:  .LBB3_2: # %entry
+; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    mv a1, a2
 ; RV64I-NEXT:    ret
 entry:
@@ -221,9 +221,9 @@ define double @cmovdouble(i1 %a, double %b, double %c) nounwind {
 ; RV32I-NEXT:    sw a3, 8(sp)
 ; RV32I-NEXT:    sw a4, 12(sp)
 ; RV32I-NEXT:    fld fa5, 8(sp)
-; RV32I-NEXT:    andi a0, a0, 1
 ; RV32I-NEXT:    sw a1, 8(sp)
 ; RV32I-NEXT:    sw a2, 12(sp)
+; RV32I-NEXT:    andi a0, a0, 1
 ; RV32I-NEXT:    beqz a0, .LBB5_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    fld fa5, 8(sp)
@@ -301,8 +301,8 @@ entry:
 define i32 @cmovdiffcc(i1 %a, i1 %b, i32 %c, i32 %d, i32 %e, i32 %f) nounwind {
 ; RV32I-LABEL: cmovdiffcc:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    andi a0, a0, 1
 ; RV32I-NEXT:    andi a1, a1, 1
+; RV32I-NEXT:    andi a0, a0, 1
 ; RV32I-NEXT:    beqz a0, .LBB7_3
 ; RV32I-NEXT:  # %bb.1: # %entry
 ; RV32I-NEXT:    beqz a1, .LBB7_4
@@ -318,8 +318,8 @@ define i32 @cmovdiffcc(i1 %a, i1 %b, i32 %c, i32 %d, i32 %e, i32 %f) nounwind {
 ;
 ; RV64I-LABEL: cmovdiffcc:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    andi a0, a0, 1
 ; RV64I-NEXT:    andi a1, a1, 1
+; RV64I-NEXT:    andi a0, a0, 1
 ; RV64I-NEXT:    beqz a0, .LBB7_3
 ; RV64I-NEXT:  # %bb.1: # %entry
 ; RV64I-NEXT:    beqz a1, .LBB7_4
diff --git a/llvm/test/CodeGen/RISCV/select-or.ll b/llvm/test/CodeGen/RISCV/select-or.ll
index 338c7c06c3ab8..b1ed06ad5b8cf 100644
--- a/llvm/test/CodeGen/RISCV/select-or.ll
+++ b/llvm/test/CodeGen/RISCV/select-or.ll
@@ -12,22 +12,22 @@
 define signext i32 @select_of_or(i1 zeroext %a, i1 zeroext %b, i32 signext %c, i32 signext %d) nounwind {
 ; RV32I-LABEL: select_of_or:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a1, .LBB0_2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    bnez a0, .LBB0_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a3
+; RV32I-NEXT:    mv a2, a3
 ; RV32I-NEXT:  .LBB0_2:
+; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: select_of_or:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    or a1, a0, a1
-; RV64I-NEXT:    mv a0, a2
-; RV64I-NEXT:    bnez a1, .LBB0_2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    bnez a0, .LBB0_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    mv a0, a3
+; RV64I-NEXT:    mv a2, a3
 ; RV64I-NEXT:  .LBB0_2:
+; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV64I-CCMOV-LABEL: select_of_or:
diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll
index e0a16aa05cd00..cb8fddd71e08c 100644
--- a/llvm/test/CodeGen/RISCV/sextw-removal.ll
+++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll
@@ -269,8 +269,8 @@ define void @test6(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT:  .LBB5_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    call baz
-; CHECK-NEXT:    feq.s a1, fa0, fs0
 ; CHECK-NEXT:    fcvt.w.s a0, fa0, rtz
+; CHECK-NEXT:    feq.s a1, fa0, fs0
 ; CHECK-NEXT:    beqz a1, .LBB5_1
 ; CHECK-NEXT:  # %bb.2: # %bb7
 ; CHECK-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -289,8 +289,8 @@ define void @test6(i32 signext %arg, i32 signext %arg1) nounwind {
 ; NOREMOVAL-NEXT:    # =>This Inner Loop Header: Depth=1
 ; NOREMOVAL-NEXT:    sext.w a0, a0
 ; NOREMOVAL-NEXT:    call baz
-; NOREMOVAL-NEXT:    feq.s a1, fa0, fs0
 ; NOREMOVAL-NEXT:    fcvt.w.s a0, fa0, rtz
+; NOREMOVAL-NEXT:    feq.s a1, fa0, fs0
 ; NOREMOVAL-NEXT:    beqz a1, .LBB5_1
 ; NOREMOVAL-NEXT:  # %bb.2: # %bb7
 ; NOREMOVAL-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -526,8 +526,8 @@ define void @test10(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT:  .LBB9_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    call baz
-; CHECK-NEXT:    feq.s a1, fa0, fs0
 ; CHECK-NEXT:    fmv.x.w a0, fa0
+; CHECK-NEXT:    feq.s a1, fa0, fs0
 ; CHECK-NEXT:    beqz a1, .LBB9_1
 ; CHECK-NEXT:  # %bb.2: # %bb7
 ; CHECK-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -546,8 +546,8 @@ define void @test10(i32 signext %arg, i32 signext %arg1) nounwind {
 ; NOREMOVAL-NEXT:    # =>This Inner Loop Header: Depth=1
 ; NOREMOVAL-NEXT:    sext.w a0, a0
 ; NOREMOVAL-NEXT:    call baz
-; NOREMOVAL-NEXT:    feq.s a1, fa0, fs0
 ; NOREMOVAL-NEXT:    fmv.x.w a0, fa0
+; NOREMOVAL-NEXT:    feq.s a1, fa0, fs0
 ; NOREMOVAL-NEXT:    beqz a1, .LBB9_1
 ; NOREMOVAL-NEXT:  # %bb.2: # %bb7
 ; NOREMOVAL-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -578,8 +578,8 @@ define signext i32 @test11(i64 %arg1, i64 %arg2, i64 %arg3)  {
 ; CHECK-NEXT:  .LBB10_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    andi a0, a0, 1234
-; CHECK-NEXT:    addi a2, a2, 1
 ; CHECK-NEXT:    addw a0, a0, a1
+; CHECK-NEXT:    addi a2, a2, 1
 ; CHECK-NEXT:    bltu a2, a3, .LBB10_1
 ; CHECK-NEXT:  # %bb.2: # %bb7
 ; CHECK-NEXT:    ret
@@ -591,8 +591,8 @@ define signext i32 @test11(i64 %arg1, i64 %arg2, i64 %arg3)  {
 ; NOREMOVAL-NEXT:  .LBB10_1: # %bb2
 ; NOREMOVAL-NEXT:    # =>This Inner Loop Header: Depth=1
 ; NOREMOVAL-NEXT:    andi a0, a0, 1234
-; NOREMOVAL-NEXT:    addi a2, a2, 1
 ; NOREMOVAL-NEXT:    add a0, a0, a1
+; NOREMOVAL-NEXT:    addi a2, a2, 1
 ; NOREMOVAL-NEXT:    bltu a2, a3, .LBB10_1
 ; NOREMOVAL-NEXT:  # %bb.2: # %bb7
 ; NOREMOVAL-NEXT:    sext.w a0, a0
@@ -626,8 +626,8 @@ define signext i32 @test12(i64 %arg1, i64 %arg2, i64 %arg3)  {
 ; CHECK-NEXT:    mulw a2, a0, a1
 ; CHECK-NEXT:    addw a0, a0, a2
 ; CHECK-NEXT:    and a2, a2, a0
-; CHECK-NEXT:    addi a3, a3, 1
 ; CHECK-NEXT:    add a0, a2, a1
+; CHECK-NEXT:    addi a3, a3, 1
 ; CHECK-NEXT:    bltu a3, a4, .LBB11_1
 ; CHECK-NEXT:  # %bb.2: # %bb7
 ; CHECK-NEXT:    mv a0, a2
@@ -643,8 +643,8 @@ define signext i32 @test12(i64 %arg1, i64 %arg2, i64 %arg3)  {
 ; NOREMOVAL-NEXT:    mul a4, a0, a1
 ; NOREMOVAL-NEXT:    add a0, a0, a4
 ; NOREMOVAL-NEXT:    and a4, a4, a0
-; NOREMOVAL-NEXT:    addi a2, a2, 1
 ; NOREMOVAL-NEXT:    add a0, a4, a1
+; NOREMOVAL-NEXT:    addi a2, a2, 1
 ; NOREMOVAL-NEXT:    bltu a2, a3, .LBB11_1
 ; NOREMOVAL-NEXT:  # %bb.2: # %bb7
 ; NOREMOVAL-NEXT:    sext.w a0, a4
@@ -678,8 +678,8 @@ define signext i32 @test13(i64 %arg1, i64 %arg2, i64 %arg3)  {
 ; CHECK-NEXT:  .LBB12_1: # %bb2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    div a0, a0, a1
-; CHECK-NEXT:    addi a2, a2, 1
 ; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    addi a2, a2, 1
 ; CHECK-NEXT:    bltu a2, a3, .LBB12_1
 ; CHECK-NEXT:  # %bb.2: # %bb7
 ; CHECK-NEXT:    sext.w a0, a0
@@ -692,8 +692,8 @@ define signext i32 @test13(i64 %arg1, i64 %arg2, i64 %arg3)  {
 ; NOREMOVAL-NEXT:  .LBB12_1: # %bb2
 ; NOREMOVAL-NEXT:    # =>This Inner Loop Header: Depth=1
 ; NOREMOVAL-NEXT:    div a0, a0, a1
-; NOREMOVAL-NEXT:    addi a2, a2, 1
 ; NOREMOVAL-NEXT:    add a0, a0, a1
+; NOREMOVAL-NEXT:    addi a2, a2, 1
 ; NOREMOVAL-NEXT:    bltu a2, a3, .LBB12_1
 ; NOREMOVAL-NEXT:  # %bb.2: # %bb7
 ; NOREMOVAL-NEXT:    sext.w a0, a0
@@ -989,8 +989,8 @@ define signext i32 @test15(i64 %arg1, i64 %arg2, i64 %arg3, ptr %arg4)  {
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    andi a0, a0, 1234
 ; CHECK-NEXT:    addw a0, a0, a1
-; CHECK-NEXT:    addi a2, a2, 1
 ; CHECK-NEXT:    sw a0, 0(a3)
+; CHECK-NEXT:    addi a2, a2, 1
 ; CHECK-NEXT:    bltu a2, a4, .LBB17_1
 ; CHECK-NEXT:  # %bb.2: # %bb7
 ; CHECK-NEXT:    ret
@@ -1003,8 +1003,8 @@ define signext i32 @test15(i64 %arg1, i64 %arg2, i64 %arg3, ptr %arg4)  {
 ; NOREMOVAL-NEXT:    # =>This Inner Loop Header: Depth=1
 ; NOREMOVAL-NEXT:    andi a0, a0, 1234
 ; NOREMOVAL-NEXT:    add a0, a0, a1
-; NOREMOVAL-NEXT:    addi a2, a2, 1
 ; NOREMOVAL-NEXT:    sw a0, 0(a3)
+; NOREMOVAL-NEXT:    addi a2, a2, 1
 ; NOREMOVAL-NEXT:    bltu a2, a4, .LBB17_1
 ; NOREMOVAL-NEXT:  # %bb.2: # %bb7
 ; NOREMOVAL-NEXT:    sext.w a0, a0
diff --git a/llvm/test/CodeGen/RISCV/shift-amount-mod.ll b/llvm/test/CodeGen/RISCV/shift-amount-mod.ll
index 1e893d9baa494..40806c5ecdf48 100644
--- a/llvm/test/CodeGen/RISCV/shift-amount-mod.ll
+++ b/llvm/test/CodeGen/RISCV/shift-amount-mod.ll
@@ -141,10 +141,9 @@ define i64 @ashr_by_complemented_64(i64 %x) {
 ; RV32I-NEXT:    sub a4, a4, a2
 ; RV32I-NEXT:    not a2, a4
 ; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    sll a1, a1, a2
-; RV32I-NEXT:    or a3, a3, a1
+; RV32I-NEXT:    sll a2, a1, a2
 ; RV32I-NEXT:    mv a1, a0
-; RV32I-NEXT:    mv a0, a3
+; RV32I-NEXT:    or a0, a3, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: ashr_by_complemented_64:
@@ -178,25 +177,25 @@ define i32 @shl_by_masked_complemented_32(i32 %x) {
 define i64 @shl_by_masked_complemented_64(i64 %x) {
 ; RV32I-LABEL: shl_by_masked_complemented_64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    li a2, 63
-; RV32I-NEXT:    sub a2, a2, a0
-; RV32I-NEXT:    andi a4, a2, 63
-; RV32I-NEXT:    addi a2, a4, -32
-; RV32I-NEXT:    not a3, a0
-; RV32I-NEXT:    bltz a2, .LBB7_2
+; RV32I-NEXT:    not a2, a0
+; RV32I-NEXT:    li a3, 63
+; RV32I-NEXT:    sub a3, a3, a0
+; RV32I-NEXT:    andi a4, a3, 63
+; RV32I-NEXT:    addi a3, a4, -32
+; RV32I-NEXT:    bltz a3, .LBB7_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    sll a1, a0, a4
 ; RV32I-NEXT:    j .LBB7_3
 ; RV32I-NEXT:  .LBB7_2:
-; RV32I-NEXT:    sll a1, a1, a3
+; RV32I-NEXT:    sll a1, a1, a2
 ; RV32I-NEXT:    not a4, a4
 ; RV32I-NEXT:    srli a5, a0, 1
 ; RV32I-NEXT:    srl a4, a5, a4
 ; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:  .LBB7_3:
-; RV32I-NEXT:    sll a0, a0, a3
-; RV32I-NEXT:    srai a2, a2, 31
-; RV32I-NEXT:    and a0, a2, a0
+; RV32I-NEXT:    sll a0, a0, a2
+; RV32I-NEXT:    srai a3, a3, 31
+; RV32I-NEXT:    and a0, a3, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: shl_by_masked_complemented_64:
@@ -213,25 +212,25 @@ define i64 @shl_by_masked_complemented_64(i64 %x) {
 define i64 @lshr_by_masked_complemented_64(i64 %x) {
 ; RV32I-LABEL: lshr_by_masked_complemented_64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    li a2, 63
-; RV32I-NEXT:    sub a2, a2, a0
-; RV32I-NEXT:    andi a4, a2, 63
-; RV32I-NEXT:    addi a2, a4, -32
-; RV32I-NEXT:    not a3, a0
-; RV32I-NEXT:    bltz a2, .LBB8_2
+; RV32I-NEXT:    not a2, a0
+; RV32I-NEXT:    li a3, 63
+; RV32I-NEXT:    sub a3, a3, a0
+; RV32I-NEXT:    andi a4, a3, 63
+; RV32I-NEXT:    addi a3, a4, -32
+; RV32I-NEXT:    bltz a3, .LBB8_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    srl a0, a1, a4
 ; RV32I-NEXT:    j .LBB8_3
 ; RV32I-NEXT:  .LBB8_2:
-; RV32I-NEXT:    srl a0, a0, a3
+; RV32I-NEXT:    srl a0, a0, a2
 ; RV32I-NEXT:    not a4, a4
 ; RV32I-NEXT:    slli a5, a1, 1
 ; RV32I-NEXT:    sll a4, a5, a4
 ; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:  .LBB8_3:
-; RV32I-NEXT:    srl a1, a1, a3
-; RV32I-NEXT:    srai a2, a2, 31
-; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    srl a1, a1, a2
+; RV32I-NEXT:    srai a3, a3, 31
+; RV32I-NEXT:    and a1, a3, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: lshr_by_masked_complemented_64:
@@ -250,22 +249,23 @@ define i64 @ashr_by_masked_complemented_64(i64 %x) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a2, 63
 ; RV32I-NEXT:    sub a2, a2, a0
-; RV32I-NEXT:    andi a2, a2, 63
-; RV32I-NEXT:    addi a3, a2, -32
-; RV32I-NEXT:    bltz a3, .LBB9_2
+; RV32I-NEXT:    andi a3, a2, 63
+; RV32I-NEXT:    addi a2, a3, -32
+; RV32I-NEXT:    bltz a2, .LBB9_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    srai a1, a1, 31
-; RV32I-NEXT:    sra a0, a0, a2
+; RV32I-NEXT:    srai a2, a1, 31
+; RV32I-NEXT:    sra a0, a1, a3
+; RV32I-NEXT:    mv a1, a2
 ; RV32I-NEXT:    ret
 ; RV32I-NEXT:  .LBB9_2:
-; RV32I-NEXT:    not a3, a0
-; RV32I-NEXT:    not a2, a2
-; RV32I-NEXT:    slli a4, a1, 1
-; RV32I-NEXT:    sra a1, a1, a3
-; RV32I-NEXT:    srl a0, a0, a3
-; RV32I-NEXT:    sll a2, a4, a2
-; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    not a4, a0
+; RV32I-NEXT:    not a3, a3
+; RV32I-NEXT:    slli a5, a1, 1
+; RV32I-NEXT:    sra a2, a1, a4
+; RV32I-NEXT:    srl a0, a0, a4
+; RV32I-NEXT:    sll a1, a5, a3
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    mv a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: ashr_by_masked_complemented_64:
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index 249dabba0cc28..fcf34b5612689 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -13,8 +13,8 @@ declare i128 @llvm.fshr.i128(i128, i128, i128)
 define i64 @lshr64(i64 %a, i64 %b) nounwind {
 ; RV32I-LABEL: lshr64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a4, a2, -32
 ; RV32I-NEXT:    srl a3, a1, a2
+; RV32I-NEXT:    addi a4, a2, -32
 ; RV32I-NEXT:    bltz a4, .LBB0_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a3
@@ -60,13 +60,12 @@ define i64 @ashr64(i64 %a, i64 %b) nounwind {
 ; RV32I-LABEL: ashr64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    mv a3, a1
-; RV32I-NEXT:    addi a4, a2, -32
 ; RV32I-NEXT:    sra a1, a1, a2
+; RV32I-NEXT:    addi a4, a2, -32
 ; RV32I-NEXT:    bltz a4, .LBB2_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    srai a3, a3, 31
 ; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    srai a1, a3, 31
 ; RV32I-NEXT:    ret
 ; RV32I-NEXT:  .LBB2_2:
 ; RV32I-NEXT:    srl a0, a0, a2
@@ -105,8 +104,8 @@ define i64 @ashr64_minsize(i64 %a, i64 %b) minsize nounwind {
 define i64 @shl64(i64 %a, i64 %b) nounwind {
 ; RV32I-LABEL: shl64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a4, a2, -32
 ; RV32I-NEXT:    sll a3, a0, a2
+; RV32I-NEXT:    addi a4, a2, -32
 ; RV32I-NEXT:    bltz a4, .LBB4_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a1, a3
@@ -197,8 +196,8 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ;
 ; RV64I-LABEL: lshr128:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi a4, a2, -64
 ; RV64I-NEXT:    srl a3, a1, a2
+; RV64I-NEXT:    addi a4, a2, -64
 ; RV64I-NEXT:    bltz a4, .LBB6_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a0, a3
@@ -268,13 +267,12 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
 ; RV64I-LABEL: ashr128:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    mv a3, a1
-; RV64I-NEXT:    addi a4, a2, -64
 ; RV64I-NEXT:    sra a1, a1, a2
+; RV64I-NEXT:    addi a4, a2, -64
 ; RV64I-NEXT:    bltz a4, .LBB7_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    srai a3, a3, 63
 ; RV64I-NEXT:    mv a0, a1
-; RV64I-NEXT:    mv a1, a3
+; RV64I-NEXT:    srai a1, a3, 63
 ; RV64I-NEXT:    ret
 ; RV64I-NEXT:  .LBB7_2:
 ; RV64I-NEXT:    srl a0, a0, a2
@@ -308,12 +306,12 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    srli a1, a2, 3
 ; RV32I-NEXT:    andi a3, a2, 31
 ; RV32I-NEXT:    andi a1, a1, 12
+; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    sub a1, a6, a1
 ; RV32I-NEXT:    lw a4, 0(a1)
 ; RV32I-NEXT:    lw a5, 4(a1)
 ; RV32I-NEXT:    lw a6, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    sll a7, a5, a2
 ; RV32I-NEXT:    srli t0, a4, 1
 ; RV32I-NEXT:    sll a1, a1, a2
@@ -336,8 +334,8 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
 ;
 ; RV64I-LABEL: shl128:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi a4, a2, -64
 ; RV64I-NEXT:    sll a3, a0, a2
+; RV64I-NEXT:    addi a4, a2, -64
 ; RV64I-NEXT:    bltz a4, .LBB8_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a1, a3
@@ -394,21 +392,21 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind {
 define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
 ; RV32I-LABEL: fshr128_minsize:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a2, 0(a2)
 ; RV32I-NEXT:    lw t1, 0(a1)
 ; RV32I-NEXT:    lw a7, 4(a1)
 ; RV32I-NEXT:    lw a4, 8(a1)
-; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    andi t2, a2, 64
+; RV32I-NEXT:    lw a3, 12(a1)
+; RV32I-NEXT:    lw a1, 0(a2)
+; RV32I-NEXT:    andi t2, a1, 64
 ; RV32I-NEXT:    mv t0, a7
-; RV32I-NEXT:    mv a3, t1
+; RV32I-NEXT:    mv a2, t1
 ; RV32I-NEXT:    beqz t2, .LBB10_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv t0, a1
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv t0, a3
+; RV32I-NEXT:    mv a2, a4
 ; RV32I-NEXT:  .LBB10_2:
-; RV32I-NEXT:    andi a6, a2, 32
-; RV32I-NEXT:    mv a5, a3
+; RV32I-NEXT:    andi a6, a1, 32
+; RV32I-NEXT:    mv a5, a2
 ; RV32I-NEXT:    bnez a6, .LBB10_13
 ; RV32I-NEXT:  # %bb.3:
 ; RV32I-NEXT:    bnez t2, .LBB10_14
@@ -418,31 +416,31 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
 ; RV32I-NEXT:    mv t0, a4
 ; RV32I-NEXT:  .LBB10_6:
 ; RV32I-NEXT:    slli t3, t0, 1
-; RV32I-NEXT:    not t1, a2
+; RV32I-NEXT:    not t1, a1
 ; RV32I-NEXT:    beqz t2, .LBB10_8
 ; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    mv a1, a7
+; RV32I-NEXT:    mv a3, a7
 ; RV32I-NEXT:  .LBB10_8:
-; RV32I-NEXT:    srl a7, a5, a2
+; RV32I-NEXT:    srl a7, a5, a1
 ; RV32I-NEXT:    sll t2, t3, t1
-; RV32I-NEXT:    srl t0, t0, a2
+; RV32I-NEXT:    srl t0, t0, a1
 ; RV32I-NEXT:    beqz a6, .LBB10_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    mv a4, a1
+; RV32I-NEXT:    mv a4, a3
 ; RV32I-NEXT:  .LBB10_10:
 ; RV32I-NEXT:    or a7, t2, a7
 ; RV32I-NEXT:    slli t2, a4, 1
 ; RV32I-NEXT:    sll t2, t2, t1
 ; RV32I-NEXT:    or t0, t2, t0
-; RV32I-NEXT:    srl a4, a4, a2
+; RV32I-NEXT:    srl a4, a4, a1
 ; RV32I-NEXT:    beqz a6, .LBB10_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    mv a3, a2
 ; RV32I-NEXT:  .LBB10_12:
-; RV32I-NEXT:    slli a3, a1, 1
-; RV32I-NEXT:    srl a1, a1, a2
+; RV32I-NEXT:    slli a2, a3, 1
+; RV32I-NEXT:    srl a1, a3, a1
 ; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    sll a2, a3, t1
+; RV32I-NEXT:    sll a2, a2, t1
 ; RV32I-NEXT:    sll a3, a5, t1
 ; RV32I-NEXT:    or a2, a2, a4
 ; RV32I-NEXT:    or a1, a3, a1
diff --git a/llvm/test/CodeGen/RISCV/shl-cttz.ll b/llvm/test/CodeGen/RISCV/shl-cttz.ll
index 500673cc29196..f408011b31456 100644
--- a/llvm/test/CodeGen/RISCV/shl-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/shl-cttz.ll
@@ -415,20 +415,20 @@ define i32 @shl_cttz_multiuse_i32(i32 %x, i32 %y) {
 ; RV32I-NEXT:    .cfi_offset ra, -4
 ; RV32I-NEXT:    .cfi_offset s0, -8
 ; RV32I-NEXT:    .cfi_offset s1, -12
-; RV32I-NEXT:    neg a2, a1
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    lui a2, 30667
-; RV32I-NEXT:    addi a2, a2, 1329
-; RV32I-NEXT:    mul a1, a1, a2
-; RV32I-NEXT:    srli a1, a1, 27
-; RV32I-NEXT:    lui a2, %hi(.LCPI7_0)
-; RV32I-NEXT:    addi a2, a2, %lo(.LCPI7_0)
-; RV32I-NEXT:    add a1, a2, a1
-; RV32I-NEXT:    lbu s0, 0(a1)
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    and a0, a1, a0
+; RV32I-NEXT:    lui a1, 30667
+; RV32I-NEXT:    addi a1, a1, 1329
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    srli a0, a0, 27
+; RV32I-NEXT:    lui a1, %hi(.LCPI7_0)
+; RV32I-NEXT:    addi a1, a1, %lo(.LCPI7_0)
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lbu s1, 0(a0)
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call use32
-; RV32I-NEXT:    sll a0, s1, s0
+; RV32I-NEXT:    sll a0, s0, s1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -474,20 +474,20 @@ define i32 @shl_cttz_multiuse_i32(i32 %x, i32 %y) {
 ; RV64I-NEXT:    .cfi_offset ra, -8
 ; RV64I-NEXT:    .cfi_offset s0, -16
 ; RV64I-NEXT:    .cfi_offset s1, -24
-; RV64I-NEXT:    negw a2, a1
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    lui a2, 30667
-; RV64I-NEXT:    addi a2, a2, 1329
-; RV64I-NEXT:    mul a1, a1, a2
-; RV64I-NEXT:    srliw a1, a1, 27
-; RV64I-NEXT:    lui a2, %hi(.LCPI7_0)
-; RV64I-NEXT:    addi a2, a2, %lo(.LCPI7_0)
-; RV64I-NEXT:    add a1, a2, a1
-; RV64I-NEXT:    lbu s0, 0(a1)
-; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    negw a0, a1
+; RV64I-NEXT:    and a0, a1, a0
+; RV64I-NEXT:    lui a1, 30667
+; RV64I-NEXT:    addi a1, a1, 1329
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    srliw a0, a0, 27
+; RV64I-NEXT:    lui a1, %hi(.LCPI7_0)
+; RV64I-NEXT:    addi a1, a1, %lo(.LCPI7_0)
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lbu s1, 0(a0)
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call use32
-; RV64I-NEXT:    sllw a0, s1, s0
+; RV64I-NEXT:    sllw a0, s0, s1
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -554,8 +554,8 @@ define i64 @shl_cttz_i64(i64 %x, i64 %y) {
 ; RV32I-NEXT:    add a2, a4, a2
 ; RV32I-NEXT:    lbu a4, 0(a2)
 ; RV32I-NEXT:  .LBB8_3: # %entry
-; RV32I-NEXT:    addi a3, a4, -32
 ; RV32I-NEXT:    sll a2, a0, a4
+; RV32I-NEXT:    addi a3, a4, -32
 ; RV32I-NEXT:    bltz a3, .LBB8_5
 ; RV32I-NEXT:  # %bb.4: # %entry
 ; RV32I-NEXT:    mv a1, a2
@@ -581,8 +581,8 @@ define i64 @shl_cttz_i64(i64 %x, i64 %y) {
 ; RV32ZBB-NEXT:  .LBB8_2:
 ; RV32ZBB-NEXT:    ctz a4, a2
 ; RV32ZBB-NEXT:  .LBB8_3: # %entry
-; RV32ZBB-NEXT:    addi a3, a4, -32
 ; RV32ZBB-NEXT:    sll a2, a0, a4
+; RV32ZBB-NEXT:    addi a3, a4, -32
 ; RV32ZBB-NEXT:    bltz a3, .LBB8_5
 ; RV32ZBB-NEXT:  # %bb.4: # %entry
 ; RV32ZBB-NEXT:    mv a1, a2
@@ -642,8 +642,8 @@ define i64 @shl_cttz_constant_i64(i64 %y) {
 ; RV32I-NEXT:    lbu a1, 0(a0)
 ; RV32I-NEXT:  .LBB9_3: # %entry
 ; RV32I-NEXT:    li a0, 4
-; RV32I-NEXT:    addi a2, a1, -32
 ; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    addi a2, a1, -32
 ; RV32I-NEXT:    bltz a2, .LBB9_5
 ; RV32I-NEXT:  # %bb.4: # %entry
 ; RV32I-NEXT:    mv a1, a0
@@ -668,8 +668,8 @@ define i64 @shl_cttz_constant_i64(i64 %y) {
 ; RV32ZBB-NEXT:    ctz a1, a0
 ; RV32ZBB-NEXT:  .LBB9_3: # %entry
 ; RV32ZBB-NEXT:    li a0, 4
-; RV32ZBB-NEXT:    addi a2, a1, -32
 ; RV32ZBB-NEXT:    sll a0, a0, a1
+; RV32ZBB-NEXT:    addi a2, a1, -32
 ; RV32ZBB-NEXT:    bltz a2, .LBB9_5
 ; RV32ZBB-NEXT:  # %bb.4: # %entry
 ; RV32ZBB-NEXT:    mv a1, a0
diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll
index 8f5b044c3b3b8..6d14c0d76a45c 100644
--- a/llvm/test/CodeGen/RISCV/split-offsets.ll
+++ b/llvm/test/CodeGen/RISCV/split-offsets.ll
@@ -56,10 +56,10 @@ define void @test2(ptr %sp, ptr %t, i32 %n) {
 ; RV32I-LABEL: test2:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    li a3, 0
-; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    lui a4, 20
 ; RV32I-NEXT:    addi a4, a4, -1920
 ; RV32I-NEXT:    add a1, a1, a4
+; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    add a0, a0, a4
 ; RV32I-NEXT:    blez a2, .LBB1_2
 ; RV32I-NEXT:  .LBB1_1: # %while_body
@@ -77,8 +77,8 @@ define void @test2(ptr %sp, ptr %t, i32 %n) {
 ; RV64I-LABEL: test2:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    li a3, 0
-; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    lui a4, 20
+; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    addiw a4, a4, -1920
 ; RV64I-NEXT:    add a1, a1, a4
 ; RV64I-NEXT:    add a0, a0, a4
diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index 42c87c9660dc9..e3aeae4df2be1 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -304,24 +304,24 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s6, 0(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lbu a1, 12(a0)
-; RV32-NEXT:    lw a2, 8(a0)
-; RV32-NEXT:    lw a3, 4(a0)
 ; RV32-NEXT:    lw a0, 0(a0)
-; RV32-NEXT:    slli a4, a1, 30
+; RV32-NEXT:    lw a1, 4(s0)
+; RV32-NEXT:    lw a2, 8(s0)
+; RV32-NEXT:    lbu a3, 12(s0)
+; RV32-NEXT:    slli a4, a3, 30
 ; RV32-NEXT:    srli s1, a2, 2
 ; RV32-NEXT:    slli a5, a2, 31
 ; RV32-NEXT:    or s1, s1, a4
-; RV32-NEXT:    srli a4, a3, 1
+; RV32-NEXT:    srli a4, a1, 1
 ; RV32-NEXT:    or s2, a4, a5
-; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    srli a3, a3, 2
 ; RV32-NEXT:    srli a2, a2, 1
-; RV32-NEXT:    slli a3, a3, 31
 ; RV32-NEXT:    slli a1, a1, 31
+; RV32-NEXT:    slli a3, a3, 31
 ; RV32-NEXT:    slli a2, a2, 31
-; RV32-NEXT:    srai s3, a1, 31
+; RV32-NEXT:    srai s3, a3, 31
 ; RV32-NEXT:    srai s4, a2, 31
-; RV32-NEXT:    srai a1, a3, 31
+; RV32-NEXT:    srai a1, a1, 31
 ; RV32-NEXT:    li a2, 6
 ; RV32-NEXT:    li a3, 0
 ; RV32-NEXT:    call __moddi3
@@ -383,19 +383,19 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV64-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    mv s0, a0
-; RV64-NEXT:    lbu a0, 12(a0)
-; RV64-NEXT:    ld a1, 0(s0)
-; RV64-NEXT:    lwu a2, 8(s0)
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    srli a3, a1, 2
-; RV64-NEXT:    or a0, a2, a0
-; RV64-NEXT:    slli a2, a2, 62
-; RV64-NEXT:    slli a1, a1, 31
-; RV64-NEXT:    or a2, a2, a3
-; RV64-NEXT:    slli s1, a0, 29
-; RV64-NEXT:    srai a0, a2, 31
-; RV64-NEXT:    srai s1, s1, 31
-; RV64-NEXT:    srai s2, a1, 31
+; RV64-NEXT:    ld a0, 0(a0)
+; RV64-NEXT:    lwu a1, 8(s0)
+; RV64-NEXT:    lbu a2, 12(s0)
+; RV64-NEXT:    slli a2, a2, 32
+; RV64-NEXT:    srli a3, a0, 2
+; RV64-NEXT:    or a2, a1, a2
+; RV64-NEXT:    slli a1, a1, 62
+; RV64-NEXT:    slli a4, a0, 31
+; RV64-NEXT:    or a0, a1, a3
+; RV64-NEXT:    slli a2, a2, 29
+; RV64-NEXT:    srai a0, a0, 31
+; RV64-NEXT:    srai s1, a2, 31
+; RV64-NEXT:    srai s2, a4, 31
 ; RV64-NEXT:    li a1, 7
 ; RV64-NEXT:    call __moddi3
 ; RV64-NEXT:    mv s3, a0
@@ -456,24 +456,24 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32M-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
 ; RV32M-NEXT:    sw s6, 0(sp) # 4-byte Folded Spill
 ; RV32M-NEXT:    mv s0, a0
-; RV32M-NEXT:    lbu a1, 12(a0)
-; RV32M-NEXT:    lw a2, 8(a0)
-; RV32M-NEXT:    lw a3, 4(a0)
 ; RV32M-NEXT:    lw a0, 0(a0)
-; RV32M-NEXT:    slli a4, a1, 30
+; RV32M-NEXT:    lw a1, 4(s0)
+; RV32M-NEXT:    lw a2, 8(s0)
+; RV32M-NEXT:    lbu a3, 12(s0)
+; RV32M-NEXT:    slli a4, a3, 30
 ; RV32M-NEXT:    srli s1, a2, 2
 ; RV32M-NEXT:    slli a5, a2, 31
 ; RV32M-NEXT:    or s1, s1, a4
-; RV32M-NEXT:    srli a4, a3, 1
+; RV32M-NEXT:    srli a4, a1, 1
 ; RV32M-NEXT:    or s2, a4, a5
-; RV32M-NEXT:    srli a1, a1, 2
+; RV32M-NEXT:    srli a3, a3, 2
 ; RV32M-NEXT:    srli a2, a2, 1
-; RV32M-NEXT:    slli a3, a3, 31
 ; RV32M-NEXT:    slli a1, a1, 31
+; RV32M-NEXT:    slli a3, a3, 31
 ; RV32M-NEXT:    slli a2, a2, 31
-; RV32M-NEXT:    srai s3, a1, 31
+; RV32M-NEXT:    srai s3, a3, 31
 ; RV32M-NEXT:    srai s4, a2, 31
-; RV32M-NEXT:    srai a1, a3, 31
+; RV32M-NEXT:    srai a1, a1, 31
 ; RV32M-NEXT:    li a2, 6
 ; RV32M-NEXT:    li a3, 0
 ; RV32M-NEXT:    call __moddi3
@@ -606,26 +606,26 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32MV-NEXT:    slli a1, a1, 1
 ; RV32MV-NEXT:    sub sp, sp, a1
 ; RV32MV-NEXT:    mv s0, a0
-; RV32MV-NEXT:    lw a1, 8(a0)
-; RV32MV-NEXT:    lbu a2, 12(a0)
-; RV32MV-NEXT:    lw a3, 4(a0)
 ; RV32MV-NEXT:    lw a0, 0(a0)
+; RV32MV-NEXT:    lw a1, 4(s0)
+; RV32MV-NEXT:    lw a2, 8(s0)
+; RV32MV-NEXT:    lbu a3, 12(s0)
 ; RV32MV-NEXT:    li a4, 1
-; RV32MV-NEXT:    slli a5, a2, 30
-; RV32MV-NEXT:    srli s1, a1, 2
-; RV32MV-NEXT:    slli a6, a1, 31
+; RV32MV-NEXT:    slli a5, a3, 30
+; RV32MV-NEXT:    srli s1, a2, 2
+; RV32MV-NEXT:    slli a6, a2, 31
 ; RV32MV-NEXT:    or s1, s1, a5
-; RV32MV-NEXT:    srli a5, a3, 1
+; RV32MV-NEXT:    srli a5, a1, 1
 ; RV32MV-NEXT:    or s2, a5, a6
 ; RV32MV-NEXT:    li a5, -1
-; RV32MV-NEXT:    srli a2, a2, 2
-; RV32MV-NEXT:    srli a1, a1, 1
+; RV32MV-NEXT:    srli a3, a3, 2
+; RV32MV-NEXT:    srli a2, a2, 1
+; RV32MV-NEXT:    slli a1, a1, 31
 ; RV32MV-NEXT:    slli a3, a3, 31
 ; RV32MV-NEXT:    slli a2, a2, 31
-; RV32MV-NEXT:    slli a6, a1, 31
-; RV32MV-NEXT:    srai a1, a3, 31
-; RV32MV-NEXT:    srai s3, a2, 31
-; RV32MV-NEXT:    srai s4, a6, 31
+; RV32MV-NEXT:    srai a1, a1, 31
+; RV32MV-NEXT:    srai s3, a3, 31
+; RV32MV-NEXT:    srai s4, a2, 31
 ; RV32MV-NEXT:    sw a5, 16(sp)
 ; RV32MV-NEXT:    sw a4, 20(sp)
 ; RV32MV-NEXT:    li a2, 6
@@ -653,17 +653,18 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32MV-NEXT:    mv a0, s1
 ; RV32MV-NEXT:    mv a1, s3
 ; RV32MV-NEXT:    call __moddi3
-; RV32MV-NEXT:    addi a2, sp, 16
-; RV32MV-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32MV-NEXT:    vlse64.v v8, (a2), zero
 ; RV32MV-NEXT:    addi a2, sp, 32
-; RV32MV-NEXT:    vl2r.v v10, (a2) # Unknown-size Folded Reload
+; RV32MV-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; RV32MV-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32MV-NEXT:    vslide1down.vx v8, v8, a0
+; RV32MV-NEXT:    addi a0, sp, 16
+; RV32MV-NEXT:    vslide1down.vx v8, v8, a1
+; RV32MV-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32MV-NEXT:    vlse64.v v10, (a0), zero
 ; RV32MV-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32MV-NEXT:    vslide1down.vx v10, v10, a0
-; RV32MV-NEXT:    vslide1down.vx v10, v10, a1
-; RV32MV-NEXT:    vslidedown.vi v10, v10, 2
+; RV32MV-NEXT:    vslidedown.vi v8, v8, 2
 ; RV32MV-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32MV-NEXT:    vand.vv v8, v10, v8
+; RV32MV-NEXT:    vand.vv v8, v8, v10
 ; RV32MV-NEXT:    vsetivli zero, 3, e8, mf2, ta, ma
 ; RV32MV-NEXT:    vmv.v.i v10, 1
 ; RV32MV-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
index cf65d4e0cf805..5cb7e1388a08f 100644
--- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
@@ -18,30 +18,29 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lh a2, 0(a1)
-; RV32I-NEXT:    lh s0, 4(a1)
-; RV32I-NEXT:    lh s1, 8(a1)
-; RV32I-NEXT:    lh s2, 12(a1)
-; RV32I-NEXT:    mv s3, a0
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lh a0, 0(a1)
+; RV32I-NEXT:    lh s1, 4(a1)
+; RV32I-NEXT:    lh s2, 8(a1)
+; RV32I-NEXT:    lh s3, 12(a1)
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __modsi3
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    li a1, -124
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    call __modsi3
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    li a1, 98
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __modsi3
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    li a1, -1003
+; RV32I-NEXT:    li a1, 98
 ; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __modsi3
-; RV32I-NEXT:    sh s4, 0(s3)
-; RV32I-NEXT:    sh s0, 2(s3)
-; RV32I-NEXT:    sh s1, 4(s3)
-; RV32I-NEXT:    sh a0, 6(s3)
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    li a1, -1003
+; RV32I-NEXT:    mv a0, s3
+; RV32I-NEXT:    call __modsi3
+; RV32I-NEXT:    sh s4, 0(s0)
+; RV32I-NEXT:    sh s1, 2(s0)
+; RV32I-NEXT:    sh s2, 4(s0)
+; RV32I-NEXT:    sh a0, 6(s0)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -110,30 +109,29 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lh a2, 0(a1)
-; RV64I-NEXT:    lh s0, 8(a1)
-; RV64I-NEXT:    lh s1, 16(a1)
-; RV64I-NEXT:    lh s2, 24(a1)
-; RV64I-NEXT:    mv s3, a0
+; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    lh a0, 0(a1)
+; RV64I-NEXT:    lh s1, 8(a1)
+; RV64I-NEXT:    lh s2, 16(a1)
+; RV64I-NEXT:    lh s3, 24(a1)
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __moddi3
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    li a1, -124
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    call __moddi3
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    li a1, 98
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __moddi3
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    li a1, -1003
+; RV64I-NEXT:    li a1, 98
 ; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call __moddi3
-; RV64I-NEXT:    sh s4, 0(s3)
-; RV64I-NEXT:    sh s0, 2(s3)
-; RV64I-NEXT:    sh s1, 4(s3)
-; RV64I-NEXT:    sh a0, 6(s3)
+; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    li a1, -1003
+; RV64I-NEXT:    mv a0, s3
+; RV64I-NEXT:    call __moddi3
+; RV64I-NEXT:    sh s4, 0(s0)
+; RV64I-NEXT:    sh s1, 2(s0)
+; RV64I-NEXT:    sh s2, 4(s0)
+; RV64I-NEXT:    sh a0, 6(s0)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -206,30 +204,29 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lh a2, 0(a1)
-; RV32I-NEXT:    lh s0, 4(a1)
-; RV32I-NEXT:    lh s1, 8(a1)
-; RV32I-NEXT:    lh s2, 12(a1)
-; RV32I-NEXT:    mv s3, a0
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lh a0, 0(a1)
+; RV32I-NEXT:    lh s1, 4(a1)
+; RV32I-NEXT:    lh s2, 8(a1)
+; RV32I-NEXT:    lh s3, 12(a1)
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __modsi3
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    call __modsi3
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __modsi3
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __modsi3
-; RV32I-NEXT:    sh s4, 0(s3)
-; RV32I-NEXT:    sh s0, 2(s3)
-; RV32I-NEXT:    sh s1, 4(s3)
-; RV32I-NEXT:    sh a0, 6(s3)
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    li a1, 95
+; RV32I-NEXT:    mv a0, s3
+; RV32I-NEXT:    call __modsi3
+; RV32I-NEXT:    sh s4, 0(s0)
+; RV32I-NEXT:    sh s1, 2(s0)
+; RV32I-NEXT:    sh s2, 4(s0)
+; RV32I-NEXT:    sh a0, 6(s0)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -291,30 +288,29 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lh a2, 0(a1)
-; RV64I-NEXT:    lh s0, 8(a1)
-; RV64I-NEXT:    lh s1, 16(a1)
-; RV64I-NEXT:    lh s2, 24(a1)
-; RV64I-NEXT:    mv s3, a0
+; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    lh a0, 0(a1)
+; RV64I-NEXT:    lh s1, 8(a1)
+; RV64I-NEXT:    lh s2, 16(a1)
+; RV64I-NEXT:    lh s3, 24(a1)
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __moddi3
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    call __moddi3
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __moddi3
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call __moddi3
-; RV64I-NEXT:    sh s4, 0(s3)
-; RV64I-NEXT:    sh s0, 2(s3)
-; RV64I-NEXT:    sh s1, 4(s3)
-; RV64I-NEXT:    sh a0, 6(s3)
+; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    li a1, 95
+; RV64I-NEXT:    mv a0, s3
+; RV64I-NEXT:    call __moddi3
+; RV64I-NEXT:    sh s4, 0(s0)
+; RV64I-NEXT:    sh s1, 2(s0)
+; RV64I-NEXT:    sh s2, 4(s0)
+; RV64I-NEXT:    sh a0, 6(s0)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -326,20 +322,20 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: fold_srem_vec_2:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lui a2, %hi(.LCPI1_0)
-; RV64IM-NEXT:    ld a2, %lo(.LCPI1_0)(a2)
-; RV64IM-NEXT:    lh a3, 0(a1)
-; RV64IM-NEXT:    lh a4, 8(a1)
-; RV64IM-NEXT:    lh a5, 16(a1)
+; RV64IM-NEXT:    lh a2, 0(a1)
+; RV64IM-NEXT:    lh a3, 8(a1)
+; RV64IM-NEXT:    lh a4, 16(a1)
 ; RV64IM-NEXT:    lh a1, 24(a1)
-; RV64IM-NEXT:    mulh a6, a3, a2
-; RV64IM-NEXT:    mulh a7, a4, a2
-; RV64IM-NEXT:    mulh t0, a5, a2
-; RV64IM-NEXT:    mulh a2, a1, a2
-; RV64IM-NEXT:    add a6, a6, a3
-; RV64IM-NEXT:    add a7, a7, a4
-; RV64IM-NEXT:    add t0, t0, a5
-; RV64IM-NEXT:    add a2, a2, a1
+; RV64IM-NEXT:    lui a5, %hi(.LCPI1_0)
+; RV64IM-NEXT:    ld a5, %lo(.LCPI1_0)(a5)
+; RV64IM-NEXT:    mulh a6, a2, a5
+; RV64IM-NEXT:    mulh a7, a3, a5
+; RV64IM-NEXT:    mulh t0, a4, a5
+; RV64IM-NEXT:    mulh a5, a1, a5
+; RV64IM-NEXT:    add a6, a6, a2
+; RV64IM-NEXT:    add a7, a7, a3
+; RV64IM-NEXT:    add t0, t0, a4
+; RV64IM-NEXT:    add a5, a5, a1
 ; RV64IM-NEXT:    srli t1, a6, 63
 ; RV64IM-NEXT:    srli a6, a6, 6
 ; RV64IM-NEXT:    add a6, a6, t1
@@ -349,21 +345,21 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    srli t1, t0, 63
 ; RV64IM-NEXT:    srli t0, t0, 6
 ; RV64IM-NEXT:    add t0, t0, t1
-; RV64IM-NEXT:    srli t1, a2, 63
-; RV64IM-NEXT:    srli a2, a2, 6
-; RV64IM-NEXT:    add a2, a2, t1
+; RV64IM-NEXT:    srli t1, a5, 63
+; RV64IM-NEXT:    srli a5, a5, 6
+; RV64IM-NEXT:    add a5, a5, t1
 ; RV64IM-NEXT:    li t1, 95
 ; RV64IM-NEXT:    mul a6, a6, t1
 ; RV64IM-NEXT:    mul a7, a7, t1
 ; RV64IM-NEXT:    mul t0, t0, t1
-; RV64IM-NEXT:    mul a2, a2, t1
-; RV64IM-NEXT:    subw a3, a3, a6
-; RV64IM-NEXT:    subw a4, a4, a7
-; RV64IM-NEXT:    subw a5, a5, t0
-; RV64IM-NEXT:    subw a1, a1, a2
-; RV64IM-NEXT:    sh a3, 0(a0)
-; RV64IM-NEXT:    sh a4, 2(a0)
-; RV64IM-NEXT:    sh a5, 4(a0)
+; RV64IM-NEXT:    mul a5, a5, t1
+; RV64IM-NEXT:    subw a2, a2, a6
+; RV64IM-NEXT:    subw a3, a3, a7
+; RV64IM-NEXT:    subw a4, a4, t0
+; RV64IM-NEXT:    subw a1, a1, a5
+; RV64IM-NEXT:    sh a2, 0(a0)
+; RV64IM-NEXT:    sh a3, 2(a0)
+; RV64IM-NEXT:    sh a4, 4(a0)
 ; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
@@ -386,11 +382,11 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    lh s1, 0(a1)
 ; RV32I-NEXT:    lh s2, 4(a1)
 ; RV32I-NEXT:    lh s3, 8(a1)
 ; RV32I-NEXT:    lh s4, 12(a1)
-; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, s4
 ; RV32I-NEXT:    call __modsi3
@@ -503,11 +499,11 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s6, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s7, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s8, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    lh s1, 0(a1)
 ; RV64I-NEXT:    lh s2, 8(a1)
 ; RV64I-NEXT:    lh s3, 16(a1)
 ; RV64I-NEXT:    lh s4, 24(a1)
-; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, s4
 ; RV64I-NEXT:    call __moddi3
@@ -562,49 +558,49 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: combine_srem_sdiv:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lh a2, 16(a1)
-; RV64IM-NEXT:    lh a3, 24(a1)
-; RV64IM-NEXT:    lui a4, %hi(.LCPI2_0)
-; RV64IM-NEXT:    ld a4, %lo(.LCPI2_0)(a4)
-; RV64IM-NEXT:    lh a5, 0(a1)
-; RV64IM-NEXT:    lh a1, 8(a1)
+; RV64IM-NEXT:    lh a2, 0(a1)
+; RV64IM-NEXT:    lh a3, 8(a1)
+; RV64IM-NEXT:    lh a4, 16(a1)
+; RV64IM-NEXT:    lh a1, 24(a1)
+; RV64IM-NEXT:    lui a5, %hi(.LCPI2_0)
 ; RV64IM-NEXT:    li a6, 95
-; RV64IM-NEXT:    mulh a7, a3, a4
-; RV64IM-NEXT:    mulh t0, a2, a4
-; RV64IM-NEXT:    mulh t1, a1, a4
-; RV64IM-NEXT:    mulh a4, a5, a4
-; RV64IM-NEXT:    add a7, a7, a3
-; RV64IM-NEXT:    add t0, t0, a2
-; RV64IM-NEXT:    add t1, t1, a1
-; RV64IM-NEXT:    add a4, a4, a5
+; RV64IM-NEXT:    ld a5, %lo(.LCPI2_0)(a5)
+; RV64IM-NEXT:    mulh a7, a1, a5
+; RV64IM-NEXT:    mulh t0, a4, a5
+; RV64IM-NEXT:    mulh t1, a3, a5
+; RV64IM-NEXT:    mulh a5, a2, a5
+; RV64IM-NEXT:    add a7, a7, a1
+; RV64IM-NEXT:    add t0, t0, a4
+; RV64IM-NEXT:    add t1, t1, a3
+; RV64IM-NEXT:    add a5, a5, a2
 ; RV64IM-NEXT:    srli t2, a7, 63
 ; RV64IM-NEXT:    srai a7, a7, 6
 ; RV64IM-NEXT:    srli t3, t0, 63
 ; RV64IM-NEXT:    srai t0, t0, 6
 ; RV64IM-NEXT:    srli t4, t1, 63
 ; RV64IM-NEXT:    srai t1, t1, 6
-; RV64IM-NEXT:    srli t5, a4, 63
-; RV64IM-NEXT:    srai a4, a4, 6
+; RV64IM-NEXT:    srli t5, a5, 63
+; RV64IM-NEXT:    srai a5, a5, 6
 ; RV64IM-NEXT:    add a7, a7, t2
 ; RV64IM-NEXT:    add t0, t0, t3
 ; RV64IM-NEXT:    add t1, t1, t4
-; RV64IM-NEXT:    add a4, a4, t5
+; RV64IM-NEXT:    add a5, a5, t5
 ; RV64IM-NEXT:    mul t2, a7, a6
 ; RV64IM-NEXT:    mul t3, t0, a6
 ; RV64IM-NEXT:    mul t4, t1, a6
-; RV64IM-NEXT:    mul a6, a4, a6
-; RV64IM-NEXT:    add a4, a5, a4
-; RV64IM-NEXT:    add a1, a1, t1
-; RV64IM-NEXT:    add a2, a2, t0
-; RV64IM-NEXT:    add a3, a3, a7
-; RV64IM-NEXT:    subw a4, a4, a6
-; RV64IM-NEXT:    subw a1, a1, t4
-; RV64IM-NEXT:    subw a2, a2, t3
-; RV64IM-NEXT:    subw a3, a3, t2
-; RV64IM-NEXT:    sh a4, 0(a0)
-; RV64IM-NEXT:    sh a1, 2(a0)
-; RV64IM-NEXT:    sh a2, 4(a0)
-; RV64IM-NEXT:    sh a3, 6(a0)
+; RV64IM-NEXT:    mul a6, a5, a6
+; RV64IM-NEXT:    add a2, a2, a5
+; RV64IM-NEXT:    add a3, a3, t1
+; RV64IM-NEXT:    add a4, a4, t0
+; RV64IM-NEXT:    add a1, a1, a7
+; RV64IM-NEXT:    subw a2, a2, a6
+; RV64IM-NEXT:    subw a3, a3, t4
+; RV64IM-NEXT:    subw a4, a4, t3
+; RV64IM-NEXT:    subw a1, a1, t2
+; RV64IM-NEXT:    sh a2, 0(a0)
+; RV64IM-NEXT:    sh a3, 2(a0)
+; RV64IM-NEXT:    sh a4, 4(a0)
+; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
   %2 = sdiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
@@ -655,36 +651,36 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_srem_power_of_two:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 4(a1)
-; RV32IM-NEXT:    lh a3, 8(a1)
-; RV32IM-NEXT:    lh a4, 12(a1)
-; RV32IM-NEXT:    lh a1, 0(a1)
+; RV32IM-NEXT:    lh a2, 0(a1)
+; RV32IM-NEXT:    lh a3, 4(a1)
+; RV32IM-NEXT:    lh a4, 8(a1)
+; RV32IM-NEXT:    lh a1, 12(a1)
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
-; RV32IM-NEXT:    mulh a5, a4, a5
-; RV32IM-NEXT:    add a5, a5, a4
+; RV32IM-NEXT:    mulh a5, a1, a5
+; RV32IM-NEXT:    add a5, a5, a1
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, a6
-; RV32IM-NEXT:    srli a6, a1, 26
-; RV32IM-NEXT:    add a6, a1, a6
-; RV32IM-NEXT:    andi a6, a6, -64
-; RV32IM-NEXT:    sub a1, a1, a6
-; RV32IM-NEXT:    srli a6, a2, 27
+; RV32IM-NEXT:    srli a6, a2, 26
 ; RV32IM-NEXT:    add a6, a2, a6
-; RV32IM-NEXT:    andi a6, a6, -32
+; RV32IM-NEXT:    andi a6, a6, -64
 ; RV32IM-NEXT:    sub a2, a2, a6
-; RV32IM-NEXT:    srli a6, a3, 29
+; RV32IM-NEXT:    srli a6, a3, 27
 ; RV32IM-NEXT:    add a6, a3, a6
-; RV32IM-NEXT:    andi a6, a6, -8
+; RV32IM-NEXT:    andi a6, a6, -32
 ; RV32IM-NEXT:    sub a3, a3, a6
+; RV32IM-NEXT:    srli a6, a4, 29
+; RV32IM-NEXT:    add a6, a4, a6
+; RV32IM-NEXT:    andi a6, a6, -8
+; RV32IM-NEXT:    sub a4, a4, a6
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a4, a4, a5
-; RV32IM-NEXT:    sh a1, 0(a0)
-; RV32IM-NEXT:    sh a2, 2(a0)
-; RV32IM-NEXT:    sh a3, 4(a0)
-; RV32IM-NEXT:    sh a4, 6(a0)
+; RV32IM-NEXT:    sub a1, a1, a5
+; RV32IM-NEXT:    sh a2, 0(a0)
+; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sh a4, 4(a0)
+; RV32IM-NEXT:    sh a1, 6(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_srem_power_of_two:
@@ -773,26 +769,25 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lh a2, 4(a1)
-; RV32I-NEXT:    lh s0, 8(a1)
-; RV32I-NEXT:    lh s1, 12(a1)
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lh a0, 4(a1)
+; RV32I-NEXT:    lh s1, 8(a1)
+; RV32I-NEXT:    lh s2, 12(a1)
 ; RV32I-NEXT:    li a1, 654
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __modsi3
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    li a1, 23
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __modsi3
-; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    lui a0, 1
 ; RV32I-NEXT:    addi a1, a0, 1327
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __modsi3
-; RV32I-NEXT:    sh zero, 0(s2)
-; RV32I-NEXT:    sh s3, 2(s2)
-; RV32I-NEXT:    sh s0, 4(s2)
-; RV32I-NEXT:    sh a0, 6(s2)
+; RV32I-NEXT:    sh zero, 0(s0)
+; RV32I-NEXT:    sh s3, 2(s0)
+; RV32I-NEXT:    sh s1, 4(s0)
+; RV32I-NEXT:    sh a0, 6(s0)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -850,26 +845,25 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lh a2, 8(a1)
-; RV64I-NEXT:    lh s0, 16(a1)
-; RV64I-NEXT:    lh s1, 24(a1)
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    lh a0, 8(a1)
+; RV64I-NEXT:    lh s1, 16(a1)
+; RV64I-NEXT:    lh s2, 24(a1)
 ; RV64I-NEXT:    li a1, 654
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __moddi3
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 23
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __moddi3
-; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    lui a0, 1
 ; RV64I-NEXT:    addiw a1, a0, 1327
-; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call __moddi3
-; RV64I-NEXT:    sh zero, 0(s2)
-; RV64I-NEXT:    sh s3, 2(s2)
-; RV64I-NEXT:    sh s0, 4(s2)
-; RV64I-NEXT:    sh a0, 6(s2)
+; RV64I-NEXT:    sh zero, 0(s0)
+; RV64I-NEXT:    sh s3, 2(s0)
+; RV64I-NEXT:    sh s1, 4(s0)
+; RV64I-NEXT:    sh a0, 6(s0)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -1036,31 +1030,31 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    lh a2, 8(a1)
 ; RV64IM-NEXT:    lh a3, 16(a1)
 ; RV64IM-NEXT:    lh a1, 24(a1)
-; RV64IM-NEXT:    lui a4, %hi(.LCPI5_0)
-; RV64IM-NEXT:    lui a5, %hi(.LCPI5_1)
-; RV64IM-NEXT:    ld a5, %lo(.LCPI5_1)(a5)
-; RV64IM-NEXT:    lui a6, 8
-; RV64IM-NEXT:    ld a4, %lo(.LCPI5_0)(a4)
-; RV64IM-NEXT:    srli a7, a2, 49
-; RV64IM-NEXT:    mulh a5, a1, a5
-; RV64IM-NEXT:    add a7, a2, a7
-; RV64IM-NEXT:    and a6, a7, a6
-; RV64IM-NEXT:    srli a7, a5, 63
-; RV64IM-NEXT:    srli a5, a5, 11
-; RV64IM-NEXT:    add a5, a5, a7
-; RV64IM-NEXT:    mulh a4, a3, a4
-; RV64IM-NEXT:    add a4, a4, a3
-; RV64IM-NEXT:    subw a2, a2, a6
+; RV64IM-NEXT:    lui a4, %hi(.LCPI5_1)
+; RV64IM-NEXT:    lui a5, 8
+; RV64IM-NEXT:    ld a4, %lo(.LCPI5_1)(a4)
+; RV64IM-NEXT:    srli a6, a2, 49
+; RV64IM-NEXT:    mulh a4, a1, a4
+; RV64IM-NEXT:    add a6, a2, a6
+; RV64IM-NEXT:    and a5, a6, a5
 ; RV64IM-NEXT:    srli a6, a4, 63
-; RV64IM-NEXT:    srli a4, a4, 4
+; RV64IM-NEXT:    srli a4, a4, 11
 ; RV64IM-NEXT:    add a4, a4, a6
+; RV64IM-NEXT:    lui a6, %hi(.LCPI5_0)
+; RV64IM-NEXT:    ld a6, %lo(.LCPI5_0)(a6)
+; RV64IM-NEXT:    mulh a6, a3, a6
+; RV64IM-NEXT:    add a6, a6, a3
+; RV64IM-NEXT:    subw a2, a2, a5
+; RV64IM-NEXT:    srli a5, a6, 63
+; RV64IM-NEXT:    srli a6, a6, 4
+; RV64IM-NEXT:    add a5, a6, a5
 ; RV64IM-NEXT:    lui a6, 1
 ; RV64IM-NEXT:    addi a6, a6, 1327
-; RV64IM-NEXT:    mul a5, a5, a6
-; RV64IM-NEXT:    li a6, 23
 ; RV64IM-NEXT:    mul a4, a4, a6
-; RV64IM-NEXT:    subw a1, a1, a5
-; RV64IM-NEXT:    subw a3, a3, a4
+; RV64IM-NEXT:    li a6, 23
+; RV64IM-NEXT:    mul a5, a5, a6
+; RV64IM-NEXT:    subw a1, a1, a4
+; RV64IM-NEXT:    subw a3, a3, a5
 ; RV64IM-NEXT:    sh zero, 0(a0)
 ; RV64IM-NEXT:    sh a2, 2(a0)
 ; RV64IM-NEXT:    sh a3, 4(a0)
@@ -1085,18 +1079,17 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    lw s1, 16(a1)
 ; RV32I-NEXT:    lw s2, 20(a1)
 ; RV32I-NEXT:    lw s3, 24(a1)
 ; RV32I-NEXT:    lw s4, 28(a1)
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw a0, 0(a1)
+; RV32I-NEXT:    lw a3, 4(a1)
 ; RV32I-NEXT:    lw s5, 8(a1)
 ; RV32I-NEXT:    lw s6, 12(a1)
-; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a2, 1
-; RV32I-NEXT:    mv a0, a3
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3
 ; RV32I-NEXT:    mv s7, a0
@@ -1155,18 +1148,17 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
+; RV32IM-NEXT:    mv s0, a0
 ; RV32IM-NEXT:    lw s1, 16(a1)
 ; RV32IM-NEXT:    lw s2, 20(a1)
 ; RV32IM-NEXT:    lw s3, 24(a1)
 ; RV32IM-NEXT:    lw s4, 28(a1)
-; RV32IM-NEXT:    lw a3, 0(a1)
-; RV32IM-NEXT:    lw a4, 4(a1)
+; RV32IM-NEXT:    lw a0, 0(a1)
+; RV32IM-NEXT:    lw a3, 4(a1)
 ; RV32IM-NEXT:    lw s5, 8(a1)
 ; RV32IM-NEXT:    lw s6, 12(a1)
-; RV32IM-NEXT:    mv s0, a0
 ; RV32IM-NEXT:    li a2, 1
-; RV32IM-NEXT:    mv a0, a3
-; RV32IM-NEXT:    mv a1, a4
+; RV32IM-NEXT:    mv a1, a3
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3
 ; RV32IM-NEXT:    mv s7, a0
@@ -1220,26 +1212,25 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    ld a2, 8(a1)
-; RV64I-NEXT:    ld s0, 16(a1)
-; RV64I-NEXT:    ld s1, 24(a1)
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    ld a0, 8(a1)
+; RV64I-NEXT:    ld s1, 16(a1)
+; RV64I-NEXT:    ld s2, 24(a1)
 ; RV64I-NEXT:    li a1, 654
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __moddi3
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 23
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __moddi3
-; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    lui a0, 1
 ; RV64I-NEXT:    addiw a1, a0, 1327
-; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call __moddi3
-; RV64I-NEXT:    sd zero, 0(s2)
-; RV64I-NEXT:    sd s3, 8(s2)
-; RV64I-NEXT:    sd s0, 16(s2)
-; RV64I-NEXT:    sd a0, 24(s2)
+; RV64I-NEXT:    sd zero, 0(s0)
+; RV64I-NEXT:    sd s3, 8(s0)
+; RV64I-NEXT:    sd s1, 16(s0)
+; RV64I-NEXT:    sd a0, 24(s0)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/stack-slot-size.ll b/llvm/test/CodeGen/RISCV/stack-slot-size.ll
index 71ee6d8160a9d..4691cb6032bcc 100644
--- a/llvm/test/CodeGen/RISCV/stack-slot-size.ll
+++ b/llvm/test/CodeGen/RISCV/stack-slot-size.ll
@@ -21,11 +21,11 @@ define i32 @caller129() nounwind {
 ; RV32I-NEXT:    li a0, 42
 ; RV32I-NEXT:    sw a0, 24(sp)
 ; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    mv a0, sp
 ; RV32I-NEXT:    sw zero, 0(sp)
 ; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    mv a0, sp
 ; RV32I-NEXT:    call callee129
 ; RV32I-NEXT:    lw a0, 24(sp)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -38,10 +38,10 @@ define i32 @caller129() nounwind {
 ; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    li a0, 42
 ; RV64I-NEXT:    sw a0, 36(sp)
-; RV64I-NEXT:    mv a0, sp
 ; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
+; RV64I-NEXT:    mv a0, sp
 ; RV64I-NEXT:    call callee129
 ; RV64I-NEXT:    lw a0, 36(sp)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -62,11 +62,11 @@ define i32 @caller160() nounwind {
 ; RV32I-NEXT:    li a0, 42
 ; RV32I-NEXT:    sw a0, 24(sp)
 ; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    mv a0, sp
 ; RV32I-NEXT:    sw zero, 0(sp)
 ; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    mv a0, sp
 ; RV32I-NEXT:    call callee160
 ; RV32I-NEXT:    lw a0, 24(sp)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -79,10 +79,10 @@ define i32 @caller160() nounwind {
 ; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    li a0, 42
 ; RV64I-NEXT:    sw a0, 36(sp)
-; RV64I-NEXT:    mv a0, sp
 ; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
+; RV64I-NEXT:    mv a0, sp
 ; RV64I-NEXT:    call callee160
 ; RV64I-NEXT:    lw a0, 36(sp)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -104,11 +104,11 @@ define i32 @caller161() nounwind {
 ; RV32I-NEXT:    sw a0, 24(sp)
 ; RV32I-NEXT:    sw zero, 16(sp)
 ; RV32I-NEXT:    sw zero, 20(sp)
-; RV32I-NEXT:    mv a0, sp
 ; RV32I-NEXT:    sw zero, 0(sp)
 ; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    mv a0, sp
 ; RV32I-NEXT:    call callee161
 ; RV32I-NEXT:    lw a0, 24(sp)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -121,10 +121,10 @@ define i32 @caller161() nounwind {
 ; RV64I-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    li a0, 42
 ; RV64I-NEXT:    sw a0, 36(sp)
-; RV64I-NEXT:    mv a0, sp
 ; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
+; RV64I-NEXT:    mv a0, sp
 ; RV64I-NEXT:    call callee161
 ; RV64I-NEXT:    lw a0, 36(sp)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/stack-store-check.ll b/llvm/test/CodeGen/RISCV/stack-store-check.ll
index cd1aebfea5ce4..27fa059ce5429 100644
--- a/llvm/test/CodeGen/RISCV/stack-store-check.ll
+++ b/llvm/test/CodeGen/RISCV/stack-store-check.ll
@@ -29,37 +29,37 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    sw s10, 656(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    sw s11, 652(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    lui a0, %hi(U)
-; CHECK-NEXT:    lw s9, %lo(U)(a0)
-; CHECK-NEXT:    lw s10, %lo(U+4)(a0)
-; CHECK-NEXT:    lw s11, %lo(U+8)(a0)
-; CHECK-NEXT:    lw s5, %lo(U+12)(a0)
+; CHECK-NEXT:    lw s6, %lo(U)(a0)
+; CHECK-NEXT:    lw s7, %lo(U+4)(a0)
+; CHECK-NEXT:    lw s8, %lo(U+8)(a0)
+; CHECK-NEXT:    lw s0, %lo(U+12)(a0)
 ; CHECK-NEXT:    sw zero, 616(sp)
 ; CHECK-NEXT:    sw zero, 620(sp)
 ; CHECK-NEXT:    sw zero, 624(sp)
 ; CHECK-NEXT:    sw zero, 628(sp)
+; CHECK-NEXT:    sw s6, 600(sp)
+; CHECK-NEXT:    sw s7, 604(sp)
+; CHECK-NEXT:    sw s8, 608(sp)
+; CHECK-NEXT:    sw s0, 612(sp)
 ; CHECK-NEXT:    addi a0, sp, 632
 ; CHECK-NEXT:    addi a1, sp, 616
 ; CHECK-NEXT:    addi a2, sp, 600
-; CHECK-NEXT:    sw s9, 600(sp)
-; CHECK-NEXT:    sw s10, 604(sp)
-; CHECK-NEXT:    sw s11, 608(sp)
-; CHECK-NEXT:    sw s5, 612(sp)
 ; CHECK-NEXT:    call __subtf3
 ; CHECK-NEXT:    lw s1, 632(sp)
 ; CHECK-NEXT:    lw s2, 636(sp)
 ; CHECK-NEXT:    lw s3, 640(sp)
 ; CHECK-NEXT:    lw s4, 644(sp)
-; CHECK-NEXT:    sw s9, 552(sp)
-; CHECK-NEXT:    sw s10, 556(sp)
-; CHECK-NEXT:    sw s11, 560(sp)
-; CHECK-NEXT:    sw s5, 564(sp)
-; CHECK-NEXT:    addi a0, sp, 584
-; CHECK-NEXT:    addi a1, sp, 568
-; CHECK-NEXT:    addi a2, sp, 552
+; CHECK-NEXT:    sw s6, 552(sp)
+; CHECK-NEXT:    sw s7, 556(sp)
+; CHECK-NEXT:    sw s8, 560(sp)
+; CHECK-NEXT:    sw s0, 564(sp)
 ; CHECK-NEXT:    sw s1, 568(sp)
 ; CHECK-NEXT:    sw s2, 572(sp)
 ; CHECK-NEXT:    sw s3, 576(sp)
 ; CHECK-NEXT:    sw s4, 580(sp)
+; CHECK-NEXT:    addi a0, sp, 584
+; CHECK-NEXT:    addi a1, sp, 568
+; CHECK-NEXT:    addi a2, sp, 552
 ; CHECK-NEXT:    call __subtf3
 ; CHECK-NEXT:    lw a0, 584(sp)
 ; CHECK-NEXT:    sw a0, 52(sp) # 4-byte Folded Spill
@@ -73,18 +73,22 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    sw zero, 508(sp)
 ; CHECK-NEXT:    sw zero, 512(sp)
 ; CHECK-NEXT:    sw zero, 516(sp)
+; CHECK-NEXT:    sw s6, 520(sp)
+; CHECK-NEXT:    sw s7, 524(sp)
+; CHECK-NEXT:    sw s8, 528(sp)
+; CHECK-NEXT:    sw s0, 532(sp)
 ; CHECK-NEXT:    addi a0, sp, 536
 ; CHECK-NEXT:    addi a1, sp, 520
 ; CHECK-NEXT:    addi a2, sp, 504
-; CHECK-NEXT:    sw s9, 520(sp)
-; CHECK-NEXT:    sw s10, 524(sp)
-; CHECK-NEXT:    sw s11, 528(sp)
-; CHECK-NEXT:    sw s5, 532(sp)
 ; CHECK-NEXT:    call __addtf3
-; CHECK-NEXT:    lw s0, 536(sp)
-; CHECK-NEXT:    lw s6, 540(sp)
-; CHECK-NEXT:    lw s7, 544(sp)
-; CHECK-NEXT:    lw s8, 548(sp)
+; CHECK-NEXT:    lw s5, 536(sp)
+; CHECK-NEXT:    sw s5, 36(sp) # 4-byte Folded Spill
+; CHECK-NEXT:    lw s9, 540(sp)
+; CHECK-NEXT:    sw s9, 32(sp) # 4-byte Folded Spill
+; CHECK-NEXT:    lw s10, 544(sp)
+; CHECK-NEXT:    sw s10, 28(sp) # 4-byte Folded Spill
+; CHECK-NEXT:    lw s11, 548(sp)
+; CHECK-NEXT:    sw s11, 24(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    lui a0, %hi(Y1)
 ; CHECK-NEXT:    lw a1, %lo(Y1)(a0)
 ; CHECK-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
@@ -98,13 +102,13 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    sw a2, 316(sp)
 ; CHECK-NEXT:    sw a3, 320(sp)
 ; CHECK-NEXT:    sw a0, 324(sp)
-; CHECK-NEXT:    addi a0, sp, 344
-; CHECK-NEXT:    addi a1, sp, 328
-; CHECK-NEXT:    addi a2, sp, 312
 ; CHECK-NEXT:    sw s1, 328(sp)
 ; CHECK-NEXT:    sw s2, 332(sp)
 ; CHECK-NEXT:    sw s3, 336(sp)
 ; CHECK-NEXT:    sw s4, 340(sp)
+; CHECK-NEXT:    addi a0, sp, 344
+; CHECK-NEXT:    addi a1, sp, 328
+; CHECK-NEXT:    addi a2, sp, 312
 ; CHECK-NEXT:    call __multf3
 ; CHECK-NEXT:    lw a0, 344(sp)
 ; CHECK-NEXT:    sw a0, 68(sp) # 4-byte Folded Spill
@@ -114,180 +118,176 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    sw a0, 60(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    lw a0, 356(sp)
 ; CHECK-NEXT:    sw a0, 56(sp) # 4-byte Folded Spill
-; CHECK-NEXT:    sw s9, 472(sp)
-; CHECK-NEXT:    sw s10, 476(sp)
-; CHECK-NEXT:    sw s11, 480(sp)
-; CHECK-NEXT:    sw s5, 484(sp)
+; CHECK-NEXT:    sw s6, 472(sp)
+; CHECK-NEXT:    sw s7, 476(sp)
+; CHECK-NEXT:    sw s8, 480(sp)
+; CHECK-NEXT:    sw s0, 484(sp)
+; CHECK-NEXT:    sw s5, 456(sp)
+; CHECK-NEXT:    sw s9, 460(sp)
+; CHECK-NEXT:    sw s10, 464(sp)
+; CHECK-NEXT:    sw s11, 468(sp)
 ; CHECK-NEXT:    addi a0, sp, 488
 ; CHECK-NEXT:    addi a1, sp, 472
 ; CHECK-NEXT:    addi a2, sp, 456
-; CHECK-NEXT:    sw s0, 456(sp)
-; CHECK-NEXT:    sw s6, 460(sp)
-; CHECK-NEXT:    sw s7, 464(sp)
-; CHECK-NEXT:    sw s8, 468(sp)
 ; CHECK-NEXT:    call __addtf3
-; CHECK-NEXT:    lw a3, 488(sp)
-; CHECK-NEXT:    lw a4, 492(sp)
-; CHECK-NEXT:    lw a5, 496(sp)
-; CHECK-NEXT:    lw a6, 500(sp)
+; CHECK-NEXT:    lw a0, 488(sp)
+; CHECK-NEXT:    lw a1, 492(sp)
+; CHECK-NEXT:    lw a2, 496(sp)
+; CHECK-NEXT:    lw a3, 500(sp)
 ; CHECK-NEXT:    sw zero, 424(sp)
 ; CHECK-NEXT:    sw zero, 428(sp)
 ; CHECK-NEXT:    sw zero, 432(sp)
 ; CHECK-NEXT:    sw zero, 436(sp)
+; CHECK-NEXT:    sw a0, 408(sp)
+; CHECK-NEXT:    sw a1, 412(sp)
+; CHECK-NEXT:    sw a2, 416(sp)
+; CHECK-NEXT:    sw a3, 420(sp)
 ; CHECK-NEXT:    addi a0, sp, 440
 ; CHECK-NEXT:    addi a1, sp, 424
 ; CHECK-NEXT:    addi a2, sp, 408
-; CHECK-NEXT:    sw a3, 408(sp)
-; CHECK-NEXT:    sw a4, 412(sp)
-; CHECK-NEXT:    sw a5, 416(sp)
-; CHECK-NEXT:    sw a6, 420(sp)
 ; CHECK-NEXT:    call __subtf3
-; CHECK-NEXT:    lw a0, 448(sp)
-; CHECK-NEXT:    lw a1, 452(sp)
-; CHECK-NEXT:    lw a2, 440(sp)
-; CHECK-NEXT:    lw a3, 444(sp)
+; CHECK-NEXT:    lw a0, 440(sp)
+; CHECK-NEXT:    lw a1, 444(sp)
+; CHECK-NEXT:    lw a2, 448(sp)
+; CHECK-NEXT:    lw a3, 452(sp)
 ; CHECK-NEXT:    lui a4, %hi(X)
-; CHECK-NEXT:    sw a1, %lo(X+12)(a4)
-; CHECK-NEXT:    sw a0, %lo(X+8)(a4)
-; CHECK-NEXT:    sw a3, %lo(X+4)(a4)
-; CHECK-NEXT:    sw a2, %lo(X)(a4)
-; CHECK-NEXT:    lw s5, 20(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw s5, 216(sp)
-; CHECK-NEXT:    lw s9, 16(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw s9, 220(sp)
-; CHECK-NEXT:    lw s10, 12(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw s10, 224(sp)
-; CHECK-NEXT:    lw s11, 8(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw s11, 228(sp)
+; CHECK-NEXT:    sw a3, %lo(X+12)(a4)
+; CHECK-NEXT:    sw a2, %lo(X+8)(a4)
+; CHECK-NEXT:    sw a1, %lo(X+4)(a4)
+; CHECK-NEXT:    sw a0, %lo(X)(a4)
+; CHECK-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw s1, 216(sp)
+; CHECK-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw s2, 220(sp)
+; CHECK-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw s3, 224(sp)
+; CHECK-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw s4, 228(sp)
+; CHECK-NEXT:    lw s5, 52(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw s5, 232(sp)
+; CHECK-NEXT:    lw s9, 48(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw s9, 236(sp)
+; CHECK-NEXT:    lw s10, 44(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw s10, 240(sp)
+; CHECK-NEXT:    lw s11, 40(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw s11, 244(sp)
 ; CHECK-NEXT:    addi a0, sp, 248
 ; CHECK-NEXT:    addi a1, sp, 232
 ; CHECK-NEXT:    addi a2, sp, 216
-; CHECK-NEXT:    lw s1, 52(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw s1, 232(sp)
-; CHECK-NEXT:    lw s2, 48(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw s2, 236(sp)
-; CHECK-NEXT:    lw s3, 44(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw s3, 240(sp)
-; CHECK-NEXT:    lw s4, 40(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw s4, 244(sp)
 ; CHECK-NEXT:    call __multf3
-; CHECK-NEXT:    lw a0, 248(sp)
-; CHECK-NEXT:    sw a0, 36(sp) # 4-byte Folded Spill
-; CHECK-NEXT:    lw a0, 252(sp)
-; CHECK-NEXT:    sw a0, 32(sp) # 4-byte Folded Spill
-; CHECK-NEXT:    lw a0, 256(sp)
-; CHECK-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; CHECK-NEXT:    lw a0, 260(sp)
-; CHECK-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
+; CHECK-NEXT:    lw s0, 248(sp)
+; CHECK-NEXT:    lw s6, 252(sp)
+; CHECK-NEXT:    lw s7, 256(sp)
+; CHECK-NEXT:    lw s8, 260(sp)
 ; CHECK-NEXT:    sw zero, 360(sp)
 ; CHECK-NEXT:    sw zero, 364(sp)
 ; CHECK-NEXT:    sw zero, 368(sp)
 ; CHECK-NEXT:    sw zero, 372(sp)
+; CHECK-NEXT:    lw a0, 36(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw a0, 376(sp)
+; CHECK-NEXT:    lw a0, 32(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw a0, 380(sp)
+; CHECK-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw a0, 384(sp)
+; CHECK-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw a0, 388(sp)
 ; CHECK-NEXT:    addi a0, sp, 392
 ; CHECK-NEXT:    addi a1, sp, 376
 ; CHECK-NEXT:    addi a2, sp, 360
-; CHECK-NEXT:    sw s0, 376(sp)
-; CHECK-NEXT:    sw s6, 380(sp)
-; CHECK-NEXT:    sw s7, 384(sp)
-; CHECK-NEXT:    sw s8, 388(sp)
 ; CHECK-NEXT:    call __multf3
-; CHECK-NEXT:    lw a0, 400(sp)
-; CHECK-NEXT:    lw a1, 404(sp)
-; CHECK-NEXT:    lw a2, 392(sp)
-; CHECK-NEXT:    lw a3, 396(sp)
+; CHECK-NEXT:    lw a0, 392(sp)
+; CHECK-NEXT:    lw a1, 396(sp)
+; CHECK-NEXT:    lw a2, 400(sp)
+; CHECK-NEXT:    lw a3, 404(sp)
 ; CHECK-NEXT:    lui a4, %hi(S)
-; CHECK-NEXT:    sw a1, %lo(S+12)(a4)
-; CHECK-NEXT:    sw a0, %lo(S+8)(a4)
-; CHECK-NEXT:    sw a3, %lo(S+4)(a4)
-; CHECK-NEXT:    sw a2, %lo(S)(a4)
-; CHECK-NEXT:    sw s1, 264(sp)
-; CHECK-NEXT:    sw s2, 268(sp)
-; CHECK-NEXT:    sw s3, 272(sp)
-; CHECK-NEXT:    sw s4, 276(sp)
+; CHECK-NEXT:    sw a3, %lo(S+12)(a4)
+; CHECK-NEXT:    sw a2, %lo(S+8)(a4)
+; CHECK-NEXT:    sw a1, %lo(S+4)(a4)
+; CHECK-NEXT:    sw a0, %lo(S)(a4)
+; CHECK-NEXT:    sw s5, 264(sp)
+; CHECK-NEXT:    sw s9, 268(sp)
+; CHECK-NEXT:    sw s10, 272(sp)
+; CHECK-NEXT:    sw s11, 276(sp)
+; CHECK-NEXT:    lw a0, 68(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw a0, 280(sp)
+; CHECK-NEXT:    lw a0, 64(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw a0, 284(sp)
+; CHECK-NEXT:    lw a0, 60(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw a0, 288(sp)
+; CHECK-NEXT:    lw a0, 56(sp) # 4-byte Folded Reload
+; CHECK-NEXT:    sw a0, 292(sp)
 ; CHECK-NEXT:    addi a0, sp, 296
 ; CHECK-NEXT:    addi a1, sp, 280
 ; CHECK-NEXT:    addi a2, sp, 264
-; CHECK-NEXT:    lw a3, 68(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw a3, 280(sp)
-; CHECK-NEXT:    lw a3, 64(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw a3, 284(sp)
-; CHECK-NEXT:    lw a3, 60(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw a3, 288(sp)
-; CHECK-NEXT:    lw a3, 56(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw a3, 292(sp)
 ; CHECK-NEXT:    call __subtf3
-; CHECK-NEXT:    lw a0, 304(sp)
-; CHECK-NEXT:    lw a1, 308(sp)
-; CHECK-NEXT:    lw a2, 296(sp)
-; CHECK-NEXT:    lw a3, 300(sp)
+; CHECK-NEXT:    lw a0, 296(sp)
+; CHECK-NEXT:    lw a1, 300(sp)
+; CHECK-NEXT:    lw a2, 304(sp)
+; CHECK-NEXT:    lw a3, 308(sp)
 ; CHECK-NEXT:    lui a4, %hi(T)
-; CHECK-NEXT:    sw a1, %lo(T+12)(a4)
-; CHECK-NEXT:    sw a0, %lo(T+8)(a4)
-; CHECK-NEXT:    sw a3, %lo(T+4)(a4)
-; CHECK-NEXT:    sw a2, %lo(T)(a4)
+; CHECK-NEXT:    sw a3, %lo(T+12)(a4)
+; CHECK-NEXT:    sw a2, %lo(T+8)(a4)
+; CHECK-NEXT:    sw a1, %lo(T+4)(a4)
+; CHECK-NEXT:    sw a0, %lo(T)(a4)
 ; CHECK-NEXT:    sw zero, 168(sp)
 ; CHECK-NEXT:    sw zero, 172(sp)
 ; CHECK-NEXT:    sw zero, 176(sp)
 ; CHECK-NEXT:    sw zero, 180(sp)
+; CHECK-NEXT:    sw s0, 184(sp)
+; CHECK-NEXT:    sw s6, 188(sp)
+; CHECK-NEXT:    sw s7, 192(sp)
+; CHECK-NEXT:    sw s8, 196(sp)
 ; CHECK-NEXT:    addi a0, sp, 200
 ; CHECK-NEXT:    addi a1, sp, 184
 ; CHECK-NEXT:    addi a2, sp, 168
-; CHECK-NEXT:    lw a3, 36(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw a3, 184(sp)
-; CHECK-NEXT:    lw a3, 32(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw a3, 188(sp)
-; CHECK-NEXT:    lw a3, 28(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw a3, 192(sp)
-; CHECK-NEXT:    lw a3, 24(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    sw a3, 196(sp)
 ; CHECK-NEXT:    call __addtf3
-; CHECK-NEXT:    lw a0, 208(sp)
-; CHECK-NEXT:    lw a1, 212(sp)
-; CHECK-NEXT:    lw a2, 200(sp)
-; CHECK-NEXT:    lw a3, 204(sp)
+; CHECK-NEXT:    lw a0, 200(sp)
+; CHECK-NEXT:    lw a1, 204(sp)
+; CHECK-NEXT:    lw a2, 208(sp)
+; CHECK-NEXT:    lw a3, 212(sp)
 ; CHECK-NEXT:    lui a4, %hi(Y)
-; CHECK-NEXT:    sw a1, %lo(Y+12)(a4)
-; CHECK-NEXT:    sw a0, %lo(Y+8)(a4)
-; CHECK-NEXT:    sw a3, %lo(Y+4)(a4)
-; CHECK-NEXT:    sw a2, %lo(Y)(a4)
+; CHECK-NEXT:    sw a3, %lo(Y+12)(a4)
+; CHECK-NEXT:    sw a2, %lo(Y+8)(a4)
+; CHECK-NEXT:    sw a1, %lo(Y+4)(a4)
+; CHECK-NEXT:    sw a0, %lo(Y)(a4)
 ; CHECK-NEXT:    sw zero, 120(sp)
 ; CHECK-NEXT:    sw zero, 124(sp)
 ; CHECK-NEXT:    sw zero, 128(sp)
 ; CHECK-NEXT:    sw zero, 132(sp)
+; CHECK-NEXT:    sw s1, 136(sp)
+; CHECK-NEXT:    sw s2, 140(sp)
+; CHECK-NEXT:    sw s3, 144(sp)
+; CHECK-NEXT:    sw s4, 148(sp)
 ; CHECK-NEXT:    addi a0, sp, 152
 ; CHECK-NEXT:    addi a1, sp, 136
 ; CHECK-NEXT:    addi a2, sp, 120
-; CHECK-NEXT:    sw s5, 136(sp)
-; CHECK-NEXT:    sw s9, 140(sp)
-; CHECK-NEXT:    sw s10, 144(sp)
-; CHECK-NEXT:    sw s11, 148(sp)
 ; CHECK-NEXT:    call __multf3
-; CHECK-NEXT:    lw a3, 152(sp)
-; CHECK-NEXT:    lw a4, 156(sp)
-; CHECK-NEXT:    lw a5, 160(sp)
-; CHECK-NEXT:    lw a6, 164(sp)
-; CHECK-NEXT:    lui a2, 786400
+; CHECK-NEXT:    lw a2, 152(sp)
+; CHECK-NEXT:    lw a3, 156(sp)
+; CHECK-NEXT:    lw a4, 160(sp)
+; CHECK-NEXT:    lw a5, 164(sp)
+; CHECK-NEXT:    lui a1, 786400
 ; CHECK-NEXT:    addi a0, sp, 104
-; CHECK-NEXT:    addi a1, sp, 88
 ; CHECK-NEXT:    sw zero, 72(sp)
 ; CHECK-NEXT:    sw zero, 76(sp)
 ; CHECK-NEXT:    sw zero, 80(sp)
-; CHECK-NEXT:    sw a2, 84(sp)
+; CHECK-NEXT:    sw a1, 84(sp)
+; CHECK-NEXT:    addi a1, sp, 88
+; CHECK-NEXT:    sw a2, 88(sp)
+; CHECK-NEXT:    sw a3, 92(sp)
+; CHECK-NEXT:    sw a4, 96(sp)
+; CHECK-NEXT:    sw a5, 100(sp)
 ; CHECK-NEXT:    addi a2, sp, 72
-; CHECK-NEXT:    sw a3, 88(sp)
-; CHECK-NEXT:    sw a4, 92(sp)
-; CHECK-NEXT:    sw a5, 96(sp)
-; CHECK-NEXT:    sw a6, 100(sp)
 ; CHECK-NEXT:    call __addtf3
-; CHECK-NEXT:    lw a0, 112(sp)
-; CHECK-NEXT:    lw a1, 116(sp)
-; CHECK-NEXT:    lw a2, 104(sp)
-; CHECK-NEXT:    lw a3, 108(sp)
+; CHECK-NEXT:    lw a0, 104(sp)
+; CHECK-NEXT:    lw a1, 108(sp)
+; CHECK-NEXT:    lw a2, 112(sp)
+; CHECK-NEXT:    lw a3, 116(sp)
 ; CHECK-NEXT:    lui a4, %hi(Y1)
-; CHECK-NEXT:    sw a0, %lo(Y1+8)(a4)
-; CHECK-NEXT:    sw a1, %lo(Y1+12)(a4)
-; CHECK-NEXT:    sw a2, %lo(Y1)(a4)
-; CHECK-NEXT:    sw a3, %lo(Y1+4)(a4)
+; CHECK-NEXT:    sw a2, %lo(Y1+8)(a4)
+; CHECK-NEXT:    sw a3, %lo(Y1+12)(a4)
+; CHECK-NEXT:    sw a0, %lo(Y1)(a4)
+; CHECK-NEXT:    sw a1, %lo(Y1+4)(a4)
 ; CHECK-NEXT:    lw ra, 700(sp) # 4-byte Folded Reload
 ; CHECK-NEXT:    lw s0, 696(sp) # 4-byte Folded Reload
 ; CHECK-NEXT:    lw s1, 692(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll
index 366b37ac5d472..a6acb2827acea 100644
--- a/llvm/test/CodeGen/RISCV/tail-calls.ll
+++ b/llvm/test/CodeGen/RISCV/tail-calls.ll
@@ -30,27 +30,23 @@ declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
 define void @caller_extern(ptr %src) optsize {
 ; CHECK-LABEL: caller_extern:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lui a1, %hi(dest)
-; CHECK-NEXT:    addi a1, a1, %lo(dest)
+; CHECK-NEXT:    mv a1, a0
+; CHECK-NEXT:    lui a0, %hi(dest)
+; CHECK-NEXT:    addi a0, a0, %lo(dest)
 ; CHECK-NEXT:    li a2, 7
-; CHECK-NEXT:    mv a3, a0
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:    mv a1, a3
 ; CHECK-NEXT:    tail memcpy
 ;
 ; CHECK-LARGE-ZICFILP-LABEL: caller_extern:
 ; CHECK-LARGE-ZICFILP:       # %bb.0: # %entry
 ; CHECK-LARGE-ZICFILP-NEXT:    lpad 0
+; CHECK-LARGE-ZICFILP-NEXT:    mv a1, a0
 ; CHECK-LARGE-ZICFILP-NEXT:  .Lpcrel_hi1:
-; CHECK-LARGE-ZICFILP-NEXT:    auipc a1, %pcrel_hi(.LCPI1_0)
+; CHECK-LARGE-ZICFILP-NEXT:    auipc a0, %pcrel_hi(.LCPI1_0)
 ; CHECK-LARGE-ZICFILP-NEXT:  .Lpcrel_hi2:
 ; CHECK-LARGE-ZICFILP-NEXT:    auipc a2, %pcrel_hi(.LCPI1_1)
-; CHECK-LARGE-ZICFILP-NEXT:    lw a1, %pcrel_lo(.Lpcrel_hi1)(a1)
+; CHECK-LARGE-ZICFILP-NEXT:    lw a0, %pcrel_lo(.Lpcrel_hi1)(a0)
 ; CHECK-LARGE-ZICFILP-NEXT:    lw t2, %pcrel_lo(.Lpcrel_hi2)(a2)
 ; CHECK-LARGE-ZICFILP-NEXT:    li a2, 7
-; CHECK-LARGE-ZICFILP-NEXT:    mv a3, a0
-; CHECK-LARGE-ZICFILP-NEXT:    mv a0, a1
-; CHECK-LARGE-ZICFILP-NEXT:    mv a1, a3
 ; CHECK-LARGE-ZICFILP-NEXT:    jr t2
 entry:
   tail call void @llvm.memcpy.p0.p0.i32(ptr @dest, ptr %src, i32 7, i1 false)
@@ -62,27 +58,23 @@ entry:
 define void @caller_extern_pgso(ptr %src) !prof !14 {
 ; CHECK-LABEL: caller_extern_pgso:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lui a1, %hi(dest_pgso)
-; CHECK-NEXT:    addi a1, a1, %lo(dest_pgso)
+; CHECK-NEXT:    mv a1, a0
+; CHECK-NEXT:    lui a0, %hi(dest_pgso)
+; CHECK-NEXT:    addi a0, a0, %lo(dest_pgso)
 ; CHECK-NEXT:    li a2, 7
-; CHECK-NEXT:    mv a3, a0
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:    mv a1, a3
 ; CHECK-NEXT:    tail memcpy
 ;
 ; CHECK-LARGE-ZICFILP-LABEL: caller_extern_pgso:
 ; CHECK-LARGE-ZICFILP:       # %bb.0: # %entry
 ; CHECK-LARGE-ZICFILP-NEXT:    lpad 0
+; CHECK-LARGE-ZICFILP-NEXT:    mv a1, a0
 ; CHECK-LARGE-ZICFILP-NEXT:  .Lpcrel_hi3:
-; CHECK-LARGE-ZICFILP-NEXT:    auipc a1, %pcrel_hi(.LCPI2_0)
+; CHECK-LARGE-ZICFILP-NEXT:    auipc a0, %pcrel_hi(.LCPI2_0)
 ; CHECK-LARGE-ZICFILP-NEXT:  .Lpcrel_hi4:
 ; CHECK-LARGE-ZICFILP-NEXT:    auipc a2, %pcrel_hi(.LCPI2_1)
-; CHECK-LARGE-ZICFILP-NEXT:    lw a1, %pcrel_lo(.Lpcrel_hi3)(a1)
+; CHECK-LARGE-ZICFILP-NEXT:    lw a0, %pcrel_lo(.Lpcrel_hi3)(a0)
 ; CHECK-LARGE-ZICFILP-NEXT:    lw t2, %pcrel_lo(.Lpcrel_hi4)(a2)
 ; CHECK-LARGE-ZICFILP-NEXT:    li a2, 7
-; CHECK-LARGE-ZICFILP-NEXT:    mv a3, a0
-; CHECK-LARGE-ZICFILP-NEXT:    mv a0, a1
-; CHECK-LARGE-ZICFILP-NEXT:    mv a1, a3
 ; CHECK-LARGE-ZICFILP-NEXT:    jr t2
 entry:
   tail call void @llvm.memcpy.p0.p0.i32(ptr @dest_pgso, ptr %src, i32 7, i1 false)
@@ -181,10 +173,10 @@ define void @caller_varargs(i32 %a, i32 %b) nounwind {
 ; CHECK-LARGE-ZICFILP-NEXT:    lpad 0
 ; CHECK-LARGE-ZICFILP-NEXT:    addi sp, sp, -16
 ; CHECK-LARGE-ZICFILP-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-LARGE-ZICFILP-NEXT:    sw a0, 0(sp)
 ; CHECK-LARGE-ZICFILP-NEXT:  .Lpcrel_hi7:
 ; CHECK-LARGE-ZICFILP-NEXT:    auipc a2, %pcrel_hi(.LCPI5_0)
 ; CHECK-LARGE-ZICFILP-NEXT:    lw t2, %pcrel_lo(.Lpcrel_hi7)(a2)
-; CHECK-LARGE-ZICFILP-NEXT:    sw a0, 0(sp)
 ; CHECK-LARGE-ZICFILP-NEXT:    mv a2, a1
 ; CHECK-LARGE-ZICFILP-NEXT:    mv a3, a0
 ; CHECK-LARGE-ZICFILP-NEXT:    mv a4, a0
@@ -231,19 +223,19 @@ define i32 @caller_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 %g
 ; CHECK-LARGE-ZICFILP-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
 ; CHECK-LARGE-ZICFILP-NEXT:    lw t0, 32(sp)
 ; CHECK-LARGE-ZICFILP-NEXT:    lw t1, 36(sp)
-; CHECK-LARGE-ZICFILP-NEXT:    lw t3, 40(sp)
-; CHECK-LARGE-ZICFILP-NEXT:    lw t4, 44(sp)
-; CHECK-LARGE-ZICFILP-NEXT:    lw t2, 48(sp)
+; CHECK-LARGE-ZICFILP-NEXT:    lw t2, 40(sp)
+; CHECK-LARGE-ZICFILP-NEXT:    lw t3, 44(sp)
+; CHECK-LARGE-ZICFILP-NEXT:    lw t4, 48(sp)
 ; CHECK-LARGE-ZICFILP-NEXT:    lw t5, 52(sp)
-; CHECK-LARGE-ZICFILP-NEXT:    sw t2, 16(sp)
+; CHECK-LARGE-ZICFILP-NEXT:    sw t4, 16(sp)
 ; CHECK-LARGE-ZICFILP-NEXT:    sw t5, 20(sp)
-; CHECK-LARGE-ZICFILP-NEXT:  .Lpcrel_hi8:
-; CHECK-LARGE-ZICFILP-NEXT:    auipc t2, %pcrel_hi(.LCPI6_0)
-; CHECK-LARGE-ZICFILP-NEXT:    lw t2, %pcrel_lo(.Lpcrel_hi8)(t2)
 ; CHECK-LARGE-ZICFILP-NEXT:    sw t0, 0(sp)
 ; CHECK-LARGE-ZICFILP-NEXT:    sw t1, 4(sp)
-; CHECK-LARGE-ZICFILP-NEXT:    sw t3, 8(sp)
-; CHECK-LARGE-ZICFILP-NEXT:    sw t4, 12(sp)
+; CHECK-LARGE-ZICFILP-NEXT:    sw t2, 8(sp)
+; CHECK-LARGE-ZICFILP-NEXT:    sw t3, 12(sp)
+; CHECK-LARGE-ZICFILP-NEXT:  .Lpcrel_hi8:
+; CHECK-LARGE-ZICFILP-NEXT:    auipc t0, %pcrel_hi(.LCPI6_0)
+; CHECK-LARGE-ZICFILP-NEXT:    lw t2, %pcrel_lo(.Lpcrel_hi8)(t0)
 ; CHECK-LARGE-ZICFILP-NEXT:    jalr t2
 ; CHECK-LARGE-ZICFILP-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; CHECK-LARGE-ZICFILP-NEXT:    addi sp, sp, 32
@@ -260,12 +252,12 @@ define void @caller_indirect_args() nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addi sp, sp, -32
 ; CHECK-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; CHECK-NEXT:    lui a1, 262128
-; CHECK-NEXT:    mv a0, sp
+; CHECK-NEXT:    lui a0, 262128
 ; CHECK-NEXT:    sw zero, 0(sp)
 ; CHECK-NEXT:    sw zero, 4(sp)
 ; CHECK-NEXT:    sw zero, 8(sp)
-; CHECK-NEXT:    sw a1, 12(sp)
+; CHECK-NEXT:    sw a0, 12(sp)
+; CHECK-NEXT:    mv a0, sp
 ; CHECK-NEXT:    call callee_indirect_args
 ; CHECK-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; CHECK-NEXT:    addi sp, sp, 32
@@ -276,15 +268,15 @@ define void @caller_indirect_args() nounwind {
 ; CHECK-LARGE-ZICFILP-NEXT:    lpad 0
 ; CHECK-LARGE-ZICFILP-NEXT:    addi sp, sp, -32
 ; CHECK-LARGE-ZICFILP-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; CHECK-LARGE-ZICFILP-NEXT:    lui a1, 262128
+; CHECK-LARGE-ZICFILP-NEXT:    lui a0, 262128
 ; CHECK-LARGE-ZICFILP-NEXT:  .Lpcrel_hi9:
-; CHECK-LARGE-ZICFILP-NEXT:    auipc a0, %pcrel_hi(.LCPI7_0)
-; CHECK-LARGE-ZICFILP-NEXT:    lw t2, %pcrel_lo(.Lpcrel_hi9)(a0)
-; CHECK-LARGE-ZICFILP-NEXT:    mv a0, sp
+; CHECK-LARGE-ZICFILP-NEXT:    auipc a1, %pcrel_hi(.LCPI7_0)
 ; CHECK-LARGE-ZICFILP-NEXT:    sw zero, 0(sp)
 ; CHECK-LARGE-ZICFILP-NEXT:    sw zero, 4(sp)
 ; CHECK-LARGE-ZICFILP-NEXT:    sw zero, 8(sp)
-; CHECK-LARGE-ZICFILP-NEXT:    sw a1, 12(sp)
+; CHECK-LARGE-ZICFILP-NEXT:    sw a0, 12(sp)
+; CHECK-LARGE-ZICFILP-NEXT:    lw t2, %pcrel_lo(.Lpcrel_hi9)(a1)
+; CHECK-LARGE-ZICFILP-NEXT:    mv a0, sp
 ; CHECK-LARGE-ZICFILP-NEXT:    jalr t2
 ; CHECK-LARGE-ZICFILP-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; CHECK-LARGE-ZICFILP-NEXT:    addi sp, sp, 32
diff --git a/llvm/test/CodeGen/RISCV/ucmp.ll b/llvm/test/CodeGen/RISCV/ucmp.ll
index 50da56fbc5951..e28d98bf3047e 100644
--- a/llvm/test/CodeGen/RISCV/ucmp.ll
+++ b/llvm/test/CodeGen/RISCV/ucmp.ll
@@ -89,15 +89,15 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
 ; RV32I-NEXT:    lw a2, 4(a1)
 ; RV32I-NEXT:    lw a4, 8(a1)
 ; RV32I-NEXT:    lw a5, 12(a1)
-; RV32I-NEXT:    lw a6, 12(a0)
 ; RV32I-NEXT:    lw a3, 4(a0)
-; RV32I-NEXT:    lw a7, 8(a0)
-; RV32I-NEXT:    beq a6, a5, .LBB4_2
+; RV32I-NEXT:    lw a6, 8(a0)
+; RV32I-NEXT:    lw a7, 12(a0)
+; RV32I-NEXT:    beq a7, a5, .LBB4_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t2, a6, a5
+; RV32I-NEXT:    sltu t2, a7, a5
 ; RV32I-NEXT:    j .LBB4_3
 ; RV32I-NEXT:  .LBB4_2:
-; RV32I-NEXT:    sltu t2, a7, a4
+; RV32I-NEXT:    sltu t2, a6, a4
 ; RV32I-NEXT:  .LBB4_3:
 ; RV32I-NEXT:    lw a1, 0(a1)
 ; RV32I-NEXT:    lw t0, 0(a0)
@@ -108,23 +108,23 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
 ; RV32I-NEXT:  .LBB4_5:
 ; RV32I-NEXT:    sltu a0, t0, a1
 ; RV32I-NEXT:  .LBB4_6:
-; RV32I-NEXT:    xor t1, a6, a5
-; RV32I-NEXT:    xor t3, a7, a4
+; RV32I-NEXT:    xor t1, a7, a5
+; RV32I-NEXT:    xor t3, a6, a4
 ; RV32I-NEXT:    or t1, t3, t1
 ; RV32I-NEXT:    beqz t1, .LBB4_8
 ; RV32I-NEXT:  # %bb.7:
 ; RV32I-NEXT:    mv a0, t2
 ; RV32I-NEXT:  .LBB4_8:
-; RV32I-NEXT:    beq a6, a5, .LBB4_11
+; RV32I-NEXT:    beq a7, a5, .LBB4_11
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    sltu a4, a5, a6
+; RV32I-NEXT:    sltu a4, a5, a7
 ; RV32I-NEXT:    bne a3, a2, .LBB4_12
 ; RV32I-NEXT:  .LBB4_10:
 ; RV32I-NEXT:    sltu a1, a1, t0
 ; RV32I-NEXT:    bnez t1, .LBB4_13
 ; RV32I-NEXT:    j .LBB4_14
 ; RV32I-NEXT:  .LBB4_11:
-; RV32I-NEXT:    sltu a4, a4, a7
+; RV32I-NEXT:    sltu a4, a4, a6
 ; RV32I-NEXT:    beq a3, a2, .LBB4_10
 ; RV32I-NEXT:  .LBB4_12:
 ; RV32I-NEXT:    sltu a1, a2, a3
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
index 1cdfaa5c4154b..01a8a66f53f15 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
@@ -26,10 +26,10 @@ define i8 @load_i8(ptr %p) {
 define i16 @load_i16(ptr %p) {
 ; SLOW-LABEL: load_i16:
 ; SLOW:       # %bb.0:
-; SLOW-NEXT:    lbu a1, 1(a0)
-; SLOW-NEXT:    lbu a0, 0(a0)
-; SLOW-NEXT:    slli a1, a1, 8
-; SLOW-NEXT:    or a0, a1, a0
+; SLOW-NEXT:    lbu a1, 0(a0)
+; SLOW-NEXT:    lbu a0, 1(a0)
+; SLOW-NEXT:    slli a0, a0, 8
+; SLOW-NEXT:    or a0, a0, a1
 ; SLOW-NEXT:    ret
 ;
 ; FAST-LABEL: load_i16:
@@ -43,11 +43,11 @@ define i16 @load_i16(ptr %p) {
 define i24 @load_i24(ptr %p) {
 ; SLOWBASE-LABEL: load_i24:
 ; SLOWBASE:       # %bb.0:
-; SLOWBASE-NEXT:    lbu a1, 1(a0)
-; SLOWBASE-NEXT:    lbu a2, 0(a0)
+; SLOWBASE-NEXT:    lbu a1, 0(a0)
+; SLOWBASE-NEXT:    lbu a2, 1(a0)
 ; SLOWBASE-NEXT:    lbu a0, 2(a0)
-; SLOWBASE-NEXT:    slli a1, a1, 8
-; SLOWBASE-NEXT:    or a1, a1, a2
+; SLOWBASE-NEXT:    slli a2, a2, 8
+; SLOWBASE-NEXT:    or a1, a2, a1
 ; SLOWBASE-NEXT:    slli a0, a0, 16
 ; SLOWBASE-NEXT:    or a0, a1, a0
 ; SLOWBASE-NEXT:    ret
@@ -73,10 +73,10 @@ define i24 @load_i24(ptr %p) {
 ;
 ; FAST-LABEL: load_i24:
 ; FAST:       # %bb.0:
-; FAST-NEXT:    lbu a1, 2(a0)
-; FAST-NEXT:    lhu a0, 0(a0)
-; FAST-NEXT:    slli a1, a1, 16
-; FAST-NEXT:    or a0, a0, a1
+; FAST-NEXT:    lhu a1, 0(a0)
+; FAST-NEXT:    lbu a0, 2(a0)
+; FAST-NEXT:    slli a0, a0, 16
+; FAST-NEXT:    or a0, a1, a0
 ; FAST-NEXT:    ret
   %res = load i24, ptr %p, align 1
   ret i24 %res
@@ -85,12 +85,12 @@ define i24 @load_i24(ptr %p) {
 define i32 @load_i32(ptr %p) {
 ; SLOWBASE-LABEL: load_i32:
 ; SLOWBASE:       # %bb.0:
-; SLOWBASE-NEXT:    lbu a1, 1(a0)
-; SLOWBASE-NEXT:    lbu a2, 0(a0)
+; SLOWBASE-NEXT:    lbu a1, 0(a0)
+; SLOWBASE-NEXT:    lbu a2, 1(a0)
 ; SLOWBASE-NEXT:    lbu a3, 2(a0)
 ; SLOWBASE-NEXT:    lbu a0, 3(a0)
-; SLOWBASE-NEXT:    slli a1, a1, 8
-; SLOWBASE-NEXT:    or a1, a1, a2
+; SLOWBASE-NEXT:    slli a2, a2, 8
+; SLOWBASE-NEXT:    or a1, a2, a1
 ; SLOWBASE-NEXT:    slli a3, a3, 16
 ; SLOWBASE-NEXT:    slli a0, a0, 24
 ; SLOWBASE-NEXT:    or a0, a0, a3
@@ -99,13 +99,13 @@ define i32 @load_i32(ptr %p) {
 ;
 ; RV32IZBKB-LABEL: load_i32:
 ; RV32IZBKB:       # %bb.0:
-; RV32IZBKB-NEXT:    lbu a1, 1(a0)
-; RV32IZBKB-NEXT:    lbu a2, 2(a0)
-; RV32IZBKB-NEXT:    lbu a3, 3(a0)
-; RV32IZBKB-NEXT:    lbu a0, 0(a0)
-; RV32IZBKB-NEXT:    packh a2, a2, a3
-; RV32IZBKB-NEXT:    packh a0, a0, a1
-; RV32IZBKB-NEXT:    pack a0, a0, a2
+; RV32IZBKB-NEXT:    lbu a1, 0(a0)
+; RV32IZBKB-NEXT:    lbu a2, 1(a0)
+; RV32IZBKB-NEXT:    lbu a3, 2(a0)
+; RV32IZBKB-NEXT:    lbu a0, 3(a0)
+; RV32IZBKB-NEXT:    packh a0, a3, a0
+; RV32IZBKB-NEXT:    packh a1, a1, a2
+; RV32IZBKB-NEXT:    pack a0, a1, a0
 ; RV32IZBKB-NEXT:    ret
 ;
 ; RV64IZBKB-LABEL: load_i32:
@@ -132,50 +132,50 @@ define i32 @load_i32(ptr %p) {
 define i64 @load_i64(ptr %p) {
 ; RV32I-LABEL: load_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a1, 1(a0)
-; RV32I-NEXT:    lbu a2, 2(a0)
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    slli a2, a2, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    lbu a4, 4(a0)
-; RV32I-NEXT:    lbu a5, 5(a0)
-; RV32I-NEXT:    or a2, a3, a2
-; RV32I-NEXT:    lbu a3, 6(a0)
-; RV32I-NEXT:    lbu a0, 7(a0)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    lbu a2, 1(a0)
+; RV32I-NEXT:    lbu a3, 2(a0)
+; RV32I-NEXT:    lbu a4, 3(a0)
+; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    slli a3, a3, 16
+; RV32I-NEXT:    slli a4, a4, 24
+; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a2, 4(a0)
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 6(a0)
+; RV32I-NEXT:    lbu a0, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a2, a4, a2
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a3, a0, a3
-; RV32I-NEXT:    or a0, a2, a1
-; RV32I-NEXT:    or a1, a3, a4
+; RV32I-NEXT:    or a5, a0, a5
+; RV32I-NEXT:    or a0, a3, a1
+; RV32I-NEXT:    or a1, a5, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: load_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a1, 1(a0)
-; RV64I-NEXT:    lbu a2, 2(a0)
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    slli a2, a2, 16
-; RV64I-NEXT:    slli a3, a3, 24
-; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    lbu a4, 4(a0)
-; RV64I-NEXT:    lbu a5, 5(a0)
-; RV64I-NEXT:    or a2, a3, a2
-; RV64I-NEXT:    lbu a3, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    lbu a2, 1(a0)
+; RV64I-NEXT:    lbu a3, 2(a0)
+; RV64I-NEXT:    lbu a4, 3(a0)
+; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    slli a3, a3, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    slli a4, a4, 24
 ; RV64I-NEXT:    or a1, a2, a1
-; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a2, 4(a0)
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a2, a4, a2
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -186,16 +186,16 @@ define i64 @load_i64(ptr %p) {
 ; RV32IZBKB-NEXT:    lbu a2, 1(a0)
 ; RV32IZBKB-NEXT:    lbu a3, 2(a0)
 ; RV32IZBKB-NEXT:    lbu a4, 3(a0)
-; RV32IZBKB-NEXT:    lbu a5, 5(a0)
-; RV32IZBKB-NEXT:    lbu a6, 6(a0)
-; RV32IZBKB-NEXT:    lbu a7, 7(a0)
-; RV32IZBKB-NEXT:    lbu a0, 4(a0)
 ; RV32IZBKB-NEXT:    packh a3, a3, a4
 ; RV32IZBKB-NEXT:    packh a1, a1, a2
-; RV32IZBKB-NEXT:    packh a2, a6, a7
-; RV32IZBKB-NEXT:    packh a4, a0, a5
+; RV32IZBKB-NEXT:    lbu a2, 4(a0)
+; RV32IZBKB-NEXT:    lbu a4, 5(a0)
+; RV32IZBKB-NEXT:    lbu a5, 6(a0)
+; RV32IZBKB-NEXT:    lbu a0, 7(a0)
+; RV32IZBKB-NEXT:    packh a5, a5, a0
+; RV32IZBKB-NEXT:    packh a2, a2, a4
 ; RV32IZBKB-NEXT:    pack a0, a1, a3
-; RV32IZBKB-NEXT:    pack a1, a4, a2
+; RV32IZBKB-NEXT:    pack a1, a2, a5
 ; RV32IZBKB-NEXT:    ret
 ;
 ; RV64IZBKB-LABEL: load_i64:
@@ -204,14 +204,14 @@ define i64 @load_i64(ptr %p) {
 ; RV64IZBKB-NEXT:    lbu a2, 5(a0)
 ; RV64IZBKB-NEXT:    lbu a3, 6(a0)
 ; RV64IZBKB-NEXT:    lbu a4, 7(a0)
-; RV64IZBKB-NEXT:    lbu a5, 0(a0)
-; RV64IZBKB-NEXT:    lbu a6, 1(a0)
-; RV64IZBKB-NEXT:    lbu a7, 2(a0)
-; RV64IZBKB-NEXT:    lbu a0, 3(a0)
 ; RV64IZBKB-NEXT:    packh a1, a1, a2
 ; RV64IZBKB-NEXT:    packh a2, a3, a4
-; RV64IZBKB-NEXT:    packh a3, a5, a6
-; RV64IZBKB-NEXT:    packh a0, a7, a0
+; RV64IZBKB-NEXT:    lbu a3, 0(a0)
+; RV64IZBKB-NEXT:    lbu a4, 1(a0)
+; RV64IZBKB-NEXT:    lbu a5, 2(a0)
+; RV64IZBKB-NEXT:    lbu a0, 3(a0)
+; RV64IZBKB-NEXT:    packh a3, a3, a4
+; RV64IZBKB-NEXT:    packh a0, a5, a0
 ; RV64IZBKB-NEXT:    slli a2, a2, 16
 ; RV64IZBKB-NEXT:    slli a0, a0, 16
 ; RV64IZBKB-NEXT:    or a1, a2, a1
diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
index c73a18c8869d5..106acff8fab95 100644
--- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
@@ -331,13 +331,13 @@ define void @test_urem_vec(ptr %X) nounwind {
 ; RV32-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lbu a0, 4(a0)
-; RV32-NEXT:    lw a1, 0(s0)
-; RV32-NEXT:    slli a0, a0, 10
-; RV32-NEXT:    srli s1, a1, 22
-; RV32-NEXT:    or s1, s1, a0
-; RV32-NEXT:    srli s2, a1, 11
-; RV32-NEXT:    andi a0, a1, 2047
+; RV32-NEXT:    lw a0, 0(a0)
+; RV32-NEXT:    lbu a1, 4(s0)
+; RV32-NEXT:    slli a1, a1, 10
+; RV32-NEXT:    srli s1, a0, 22
+; RV32-NEXT:    or s1, s1, a1
+; RV32-NEXT:    srli s2, a0, 11
+; RV32-NEXT:    andi a0, a0, 2047
 ; RV32-NEXT:    li a1, 683
 ; RV32-NEXT:    call __mulsi3
 ; RV32-NEXT:    slli a1, a0, 10
@@ -388,10 +388,10 @@ define void @test_urem_vec(ptr %X) nounwind {
 ; RV64-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    mv s0, a0
-; RV64-NEXT:    lbu a0, 4(a0)
-; RV64-NEXT:    lwu a1, 0(s0)
-; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    lwu a0, 0(a0)
+; RV64-NEXT:    lbu a1, 4(s0)
+; RV64-NEXT:    slli a1, a1, 32
+; RV64-NEXT:    or a0, a0, a1
 ; RV64-NEXT:    srli s1, a0, 22
 ; RV64-NEXT:    srli s2, a0, 11
 ; RV64-NEXT:    andi a0, a0, 2047
@@ -438,40 +438,40 @@ define void @test_urem_vec(ptr %X) nounwind {
 ;
 ; RV32M-LABEL: test_urem_vec:
 ; RV32M:       # %bb.0:
-; RV32M-NEXT:    lbu a1, 4(a0)
-; RV32M-NEXT:    lw a2, 0(a0)
+; RV32M-NEXT:    lw a1, 0(a0)
+; RV32M-NEXT:    lbu a2, 4(a0)
 ; RV32M-NEXT:    li a3, 683
 ; RV32M-NEXT:    li a4, 819
-; RV32M-NEXT:    slli a1, a1, 10
-; RV32M-NEXT:    srli a5, a2, 22
-; RV32M-NEXT:    or a1, a5, a1
-; RV32M-NEXT:    andi a5, a2, 2047
+; RV32M-NEXT:    slli a2, a2, 10
+; RV32M-NEXT:    srli a5, a1, 22
+; RV32M-NEXT:    or a2, a5, a2
+; RV32M-NEXT:    andi a5, a1, 2047
 ; RV32M-NEXT:    mul a3, a5, a3
 ; RV32M-NEXT:    li a5, 1463
-; RV32M-NEXT:    srli a2, a2, 11
-; RV32M-NEXT:    mul a2, a2, a5
+; RV32M-NEXT:    srli a1, a1, 11
+; RV32M-NEXT:    mul a1, a1, a5
 ; RV32M-NEXT:    slli a5, a3, 10
 ; RV32M-NEXT:    slli a3, a3, 21
-; RV32M-NEXT:    mul a1, a1, a4
-; RV32M-NEXT:    addi a2, a2, -1463
+; RV32M-NEXT:    mul a2, a2, a4
+; RV32M-NEXT:    addi a1, a1, -1463
 ; RV32M-NEXT:    srli a3, a3, 22
-; RV32M-NEXT:    addi a1, a1, -1638
-; RV32M-NEXT:    andi a2, a2, 2047
-; RV32M-NEXT:    or a3, a3, a5
+; RV32M-NEXT:    addi a2, a2, -1638
 ; RV32M-NEXT:    andi a1, a1, 2047
-; RV32M-NEXT:    sltiu a2, a2, 293
+; RV32M-NEXT:    or a3, a3, a5
+; RV32M-NEXT:    andi a2, a2, 2047
+; RV32M-NEXT:    sltiu a1, a1, 293
 ; RV32M-NEXT:    andi a3, a3, 2047
-; RV32M-NEXT:    sltiu a1, a1, 2
-; RV32M-NEXT:    addi a2, a2, -1
-; RV32M-NEXT:    sltiu a3, a3, 342
-; RV32M-NEXT:    xori a4, a1, 1
+; RV32M-NEXT:    sltiu a2, a2, 2
 ; RV32M-NEXT:    addi a1, a1, -1
-; RV32M-NEXT:    andi a2, a2, 2047
+; RV32M-NEXT:    sltiu a3, a3, 342
+; RV32M-NEXT:    xori a4, a2, 1
+; RV32M-NEXT:    addi a2, a2, -1
+; RV32M-NEXT:    andi a1, a1, 2047
 ; RV32M-NEXT:    addi a3, a3, -1
-; RV32M-NEXT:    slli a2, a2, 11
-; RV32M-NEXT:    slli a1, a1, 22
+; RV32M-NEXT:    slli a1, a1, 11
+; RV32M-NEXT:    slli a2, a2, 22
 ; RV32M-NEXT:    andi a3, a3, 2047
-; RV32M-NEXT:    or a1, a2, a1
+; RV32M-NEXT:    or a1, a1, a2
 ; RV32M-NEXT:    or a1, a3, a1
 ; RV32M-NEXT:    sw a1, 0(a0)
 ; RV32M-NEXT:    sb a4, 4(a0)
@@ -479,12 +479,12 @@ define void @test_urem_vec(ptr %X) nounwind {
 ;
 ; RV64M-LABEL: test_urem_vec:
 ; RV64M:       # %bb.0:
-; RV64M-NEXT:    lbu a1, 4(a0)
-; RV64M-NEXT:    lwu a2, 0(a0)
+; RV64M-NEXT:    lwu a1, 0(a0)
+; RV64M-NEXT:    lbu a2, 4(a0)
 ; RV64M-NEXT:    li a3, 683
 ; RV64M-NEXT:    li a4, 1463
-; RV64M-NEXT:    slli a1, a1, 32
-; RV64M-NEXT:    or a1, a2, a1
+; RV64M-NEXT:    slli a2, a2, 32
+; RV64M-NEXT:    or a1, a1, a2
 ; RV64M-NEXT:    andi a2, a1, 2047
 ; RV64M-NEXT:    mul a2, a2, a3
 ; RV64M-NEXT:    srli a3, a1, 11
@@ -538,15 +538,9 @@ define void @test_urem_vec(ptr %X) nounwind {
 ; RV32MV-NEXT:    srli a1, a1, 21
 ; RV32MV-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32MV-NEXT:    li a1, 2047
-; RV32MV-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32MV-NEXT:    vmv.v.i v11, 1
+; RV32MV-NEXT:    addi a3, a3, -1527
 ; RV32MV-NEXT:    andi a2, a2, 2047
-; RV32MV-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; RV32MV-NEXT:    vslide1down.vx v10, v10, a2
-; RV32MV-NEXT:    lui a2, %hi(.LCPI4_1)
-; RV32MV-NEXT:    addi a2, a2, %lo(.LCPI4_1)
-; RV32MV-NEXT:    addi a3, a3, -1527
-; RV32MV-NEXT:    vsext.vf2 v12, v11
 ; RV32MV-NEXT:    vslidedown.vi v10, v10, 1
 ; RV32MV-NEXT:    vsub.vv v8, v10, v8
 ; RV32MV-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -556,14 +550,20 @@ define void @test_urem_vec(ptr %X) nounwind {
 ; RV32MV-NEXT:    vmul.vv v8, v8, v9
 ; RV32MV-NEXT:    vadd.vv v9, v8, v8
 ; RV32MV-NEXT:    vsll.vv v9, v9, v11
-; RV32MV-NEXT:    vle16.v v10, (a2)
+; RV32MV-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32MV-NEXT:    vmv.v.i v10, 1
+; RV32MV-NEXT:    lui a2, %hi(.LCPI4_1)
+; RV32MV-NEXT:    addi a2, a2, %lo(.LCPI4_1)
+; RV32MV-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32MV-NEXT:    vsext.vf2 v11, v10
 ; RV32MV-NEXT:    vand.vx v8, v8, a1
-; RV32MV-NEXT:    vsrl.vv v8, v8, v12
+; RV32MV-NEXT:    vsrl.vv v8, v8, v11
+; RV32MV-NEXT:    vmv.v.i v10, 0
 ; RV32MV-NEXT:    vor.vv v8, v8, v9
+; RV32MV-NEXT:    vle16.v v9, (a2)
 ; RV32MV-NEXT:    vand.vx v8, v8, a1
-; RV32MV-NEXT:    vmsltu.vv v0, v10, v8
-; RV32MV-NEXT:    vmv.v.i v8, 0
-; RV32MV-NEXT:    vmerge.vim v8, v8, -1, v0
+; RV32MV-NEXT:    vmsltu.vv v0, v9, v8
+; RV32MV-NEXT:    vmerge.vim v8, v10, -1, v0
 ; RV32MV-NEXT:    vslidedown.vi v9, v8, 2
 ; RV32MV-NEXT:    vmv.x.s a1, v8
 ; RV32MV-NEXT:    vslidedown.vi v8, v8, 1
@@ -599,15 +599,9 @@ define void @test_urem_vec(ptr %X) nounwind {
 ; RV64MV-NEXT:    srli a2, a2, 53
 ; RV64MV-NEXT:    vslide1down.vx v10, v10, a2
 ; RV64MV-NEXT:    li a2, 2047
-; RV64MV-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64MV-NEXT:    vmv.v.i v11, 1
+; RV64MV-NEXT:    addi a3, a3, -1527
 ; RV64MV-NEXT:    srli a1, a1, 22
-; RV64MV-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; RV64MV-NEXT:    vslide1down.vx v10, v10, a1
-; RV64MV-NEXT:    lui a1, %hi(.LCPI4_1)
-; RV64MV-NEXT:    addi a1, a1, %lo(.LCPI4_1)
-; RV64MV-NEXT:    addi a3, a3, -1527
-; RV64MV-NEXT:    vsext.vf2 v12, v11
 ; RV64MV-NEXT:    vslidedown.vi v10, v10, 1
 ; RV64MV-NEXT:    vsub.vv v8, v10, v8
 ; RV64MV-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -617,14 +611,20 @@ define void @test_urem_vec(ptr %X) nounwind {
 ; RV64MV-NEXT:    vmul.vv v8, v8, v9
 ; RV64MV-NEXT:    vadd.vv v9, v8, v8
 ; RV64MV-NEXT:    vsll.vv v9, v9, v11
-; RV64MV-NEXT:    vle16.v v10, (a1)
+; RV64MV-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64MV-NEXT:    vmv.v.i v10, 1
+; RV64MV-NEXT:    lui a1, %hi(.LCPI4_1)
+; RV64MV-NEXT:    addi a1, a1, %lo(.LCPI4_1)
+; RV64MV-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64MV-NEXT:    vsext.vf2 v11, v10
 ; RV64MV-NEXT:    vand.vx v8, v8, a2
-; RV64MV-NEXT:    vsrl.vv v8, v8, v12
+; RV64MV-NEXT:    vsrl.vv v8, v8, v11
+; RV64MV-NEXT:    vmv.v.i v10, 0
 ; RV64MV-NEXT:    vor.vv v8, v8, v9
+; RV64MV-NEXT:    vle16.v v9, (a1)
 ; RV64MV-NEXT:    vand.vx v8, v8, a2
-; RV64MV-NEXT:    vmsltu.vv v0, v10, v8
-; RV64MV-NEXT:    vmv.v.i v8, 0
-; RV64MV-NEXT:    vmerge.vim v8, v8, -1, v0
+; RV64MV-NEXT:    vmsltu.vv v0, v9, v8
+; RV64MV-NEXT:    vmerge.vim v8, v10, -1, v0
 ; RV64MV-NEXT:    vmv.x.s a1, v8
 ; RV64MV-NEXT:    vslidedown.vi v9, v8, 1
 ; RV64MV-NEXT:    vslidedown.vi v8, v8, 2
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index 988856ca70923..c9d9ed13faa08 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -19,30 +19,29 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lhu a2, 0(a1)
-; RV32I-NEXT:    lhu s0, 4(a1)
-; RV32I-NEXT:    lhu s1, 8(a1)
-; RV32I-NEXT:    lhu s2, 12(a1)
-; RV32I-NEXT:    mv s3, a0
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lhu a0, 0(a1)
+; RV32I-NEXT:    lhu s1, 4(a1)
+; RV32I-NEXT:    lhu s2, 8(a1)
+; RV32I-NEXT:    lhu s3, 12(a1)
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __umodsi3
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    li a1, 124
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    call __umodsi3
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    li a1, 98
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __umodsi3
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    li a1, 1003
+; RV32I-NEXT:    li a1, 98
 ; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __umodsi3
-; RV32I-NEXT:    sh s4, 0(s3)
-; RV32I-NEXT:    sh s0, 2(s3)
-; RV32I-NEXT:    sh s1, 4(s3)
-; RV32I-NEXT:    sh a0, 6(s3)
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    li a1, 1003
+; RV32I-NEXT:    mv a0, s3
+; RV32I-NEXT:    call __umodsi3
+; RV32I-NEXT:    sh s4, 0(s0)
+; RV32I-NEXT:    sh s1, 2(s0)
+; RV32I-NEXT:    sh s2, 4(s0)
+; RV32I-NEXT:    sh a0, 6(s0)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -98,30 +97,29 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lhu a2, 0(a1)
-; RV64I-NEXT:    lhu s0, 8(a1)
-; RV64I-NEXT:    lhu s1, 16(a1)
-; RV64I-NEXT:    lhu s2, 24(a1)
-; RV64I-NEXT:    mv s3, a0
+; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    lhu a0, 0(a1)
+; RV64I-NEXT:    lhu s1, 8(a1)
+; RV64I-NEXT:    lhu s2, 16(a1)
+; RV64I-NEXT:    lhu s3, 24(a1)
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    li a1, 124
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    call __umoddi3
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    li a1, 98
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __umoddi3
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    li a1, 1003
+; RV64I-NEXT:    li a1, 98
 ; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call __umoddi3
-; RV64I-NEXT:    sh s4, 0(s3)
-; RV64I-NEXT:    sh s0, 2(s3)
-; RV64I-NEXT:    sh s1, 4(s3)
-; RV64I-NEXT:    sh a0, 6(s3)
+; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    li a1, 1003
+; RV64I-NEXT:    mv a0, s3
+; RV64I-NEXT:    call __umoddi3
+; RV64I-NEXT:    sh s4, 0(s0)
+; RV64I-NEXT:    sh s1, 2(s0)
+; RV64I-NEXT:    sh s2, 4(s0)
+; RV64I-NEXT:    sh a0, 6(s0)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -140,18 +138,18 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI0_0)
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI0_1)
 ; RV64IM-NEXT:    li a7, 95
-; RV64IM-NEXT:    ld a6, %lo(.LCPI0_1)(a6)
 ; RV64IM-NEXT:    lui t0, %hi(.LCPI0_2)
 ; RV64IM-NEXT:    li t1, 98
-; RV64IM-NEXT:    ld t0, %lo(.LCPI0_2)(t0)
+; RV64IM-NEXT:    ld a6, %lo(.LCPI0_1)(a6)
 ; RV64IM-NEXT:    mulhu a6, a2, a6
 ; RV64IM-NEXT:    mul a6, a6, a7
 ; RV64IM-NEXT:    lui a7, %hi(.LCPI0_3)
-; RV64IM-NEXT:    ld a5, %lo(.LCPI0_0)(a5)
-; RV64IM-NEXT:    ld a7, %lo(.LCPI0_3)(a7)
+; RV64IM-NEXT:    ld t0, %lo(.LCPI0_2)(t0)
 ; RV64IM-NEXT:    mulhu t0, a4, t0
 ; RV64IM-NEXT:    mul t0, t0, t1
 ; RV64IM-NEXT:    li t1, 1003
+; RV64IM-NEXT:    ld a5, %lo(.LCPI0_0)(a5)
+; RV64IM-NEXT:    ld a7, %lo(.LCPI0_3)(a7)
 ; RV64IM-NEXT:    mulhu a5, a3, a5
 ; RV64IM-NEXT:    mulhu a7, a1, a7
 ; RV64IM-NEXT:    mul a7, a7, t1
@@ -181,30 +179,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lhu a2, 0(a1)
-; RV32I-NEXT:    lhu s0, 4(a1)
-; RV32I-NEXT:    lhu s1, 8(a1)
-; RV32I-NEXT:    lhu s2, 12(a1)
-; RV32I-NEXT:    mv s3, a0
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lhu a0, 0(a1)
+; RV32I-NEXT:    lhu s1, 4(a1)
+; RV32I-NEXT:    lhu s2, 8(a1)
+; RV32I-NEXT:    lhu s3, 12(a1)
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __umodsi3
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    call __umodsi3
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __umodsi3
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __umodsi3
-; RV32I-NEXT:    sh s4, 0(s3)
-; RV32I-NEXT:    sh s0, 2(s3)
-; RV32I-NEXT:    sh s1, 4(s3)
-; RV32I-NEXT:    sh a0, 6(s3)
+; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    li a1, 95
+; RV32I-NEXT:    mv a0, s3
+; RV32I-NEXT:    call __umodsi3
+; RV32I-NEXT:    sh s4, 0(s0)
+; RV32I-NEXT:    sh s1, 2(s0)
+; RV32I-NEXT:    sh s2, 4(s0)
+; RV32I-NEXT:    sh a0, 6(s0)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -250,30 +247,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lhu a2, 0(a1)
-; RV64I-NEXT:    lhu s0, 8(a1)
-; RV64I-NEXT:    lhu s1, 16(a1)
-; RV64I-NEXT:    lhu s2, 24(a1)
-; RV64I-NEXT:    mv s3, a0
+; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    lhu a0, 0(a1)
+; RV64I-NEXT:    lhu s1, 8(a1)
+; RV64I-NEXT:    lhu s2, 16(a1)
+; RV64I-NEXT:    lhu s3, 24(a1)
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    call __umoddi3
-; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __umoddi3
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call __umoddi3
-; RV64I-NEXT:    sh s4, 0(s3)
-; RV64I-NEXT:    sh s0, 2(s3)
-; RV64I-NEXT:    sh s1, 4(s3)
-; RV64I-NEXT:    sh a0, 6(s3)
+; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    li a1, 95
+; RV64I-NEXT:    mv a0, s3
+; RV64I-NEXT:    call __umoddi3
+; RV64I-NEXT:    sh s4, 0(s0)
+; RV64I-NEXT:    sh s1, 2(s0)
+; RV64I-NEXT:    sh s2, 4(s0)
+; RV64I-NEXT:    sh a0, 6(s0)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -285,28 +281,28 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: fold_urem_vec_2:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lui a2, %hi(.LCPI1_0)
-; RV64IM-NEXT:    ld a2, %lo(.LCPI1_0)(a2)
-; RV64IM-NEXT:    lhu a3, 0(a1)
-; RV64IM-NEXT:    lhu a4, 8(a1)
-; RV64IM-NEXT:    lhu a5, 16(a1)
+; RV64IM-NEXT:    lhu a2, 0(a1)
+; RV64IM-NEXT:    lhu a3, 8(a1)
+; RV64IM-NEXT:    lhu a4, 16(a1)
 ; RV64IM-NEXT:    lhu a1, 24(a1)
+; RV64IM-NEXT:    lui a5, %hi(.LCPI1_0)
 ; RV64IM-NEXT:    li a6, 95
-; RV64IM-NEXT:    mulhu a7, a3, a2
-; RV64IM-NEXT:    mulhu t0, a4, a2
-; RV64IM-NEXT:    mulhu t1, a5, a2
-; RV64IM-NEXT:    mulhu a2, a1, a2
+; RV64IM-NEXT:    ld a5, %lo(.LCPI1_0)(a5)
+; RV64IM-NEXT:    mulhu a7, a2, a5
+; RV64IM-NEXT:    mulhu t0, a3, a5
+; RV64IM-NEXT:    mulhu t1, a4, a5
+; RV64IM-NEXT:    mulhu a5, a1, a5
 ; RV64IM-NEXT:    mul a7, a7, a6
 ; RV64IM-NEXT:    mul t0, t0, a6
 ; RV64IM-NEXT:    mul t1, t1, a6
-; RV64IM-NEXT:    mul a2, a2, a6
-; RV64IM-NEXT:    subw a3, a3, a7
-; RV64IM-NEXT:    subw a4, a4, t0
-; RV64IM-NEXT:    subw a5, a5, t1
-; RV64IM-NEXT:    subw a1, a1, a2
-; RV64IM-NEXT:    sh a3, 0(a0)
-; RV64IM-NEXT:    sh a4, 2(a0)
-; RV64IM-NEXT:    sh a5, 4(a0)
+; RV64IM-NEXT:    mul a5, a5, a6
+; RV64IM-NEXT:    subw a2, a2, a7
+; RV64IM-NEXT:    subw a3, a3, t0
+; RV64IM-NEXT:    subw a4, a4, t1
+; RV64IM-NEXT:    subw a1, a1, a5
+; RV64IM-NEXT:    sh a2, 0(a0)
+; RV64IM-NEXT:    sh a3, 2(a0)
+; RV64IM-NEXT:    sh a4, 4(a0)
 ; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
@@ -329,11 +325,11 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    lhu s1, 0(a1)
 ; RV32I-NEXT:    lhu s2, 4(a1)
 ; RV32I-NEXT:    lhu s3, 8(a1)
 ; RV32I-NEXT:    lhu s4, 12(a1)
-; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, s4
 ; RV32I-NEXT:    call __umodsi3
@@ -430,11 +426,11 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s6, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s7, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s8, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    lhu s1, 0(a1)
 ; RV64I-NEXT:    lhu s2, 8(a1)
 ; RV64I-NEXT:    lhu s3, 16(a1)
 ; RV64I-NEXT:    lhu s4, 24(a1)
-; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, s4
 ; RV64I-NEXT:    call __umoddi3
@@ -489,33 +485,33 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: combine_urem_udiv:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lhu a2, 16(a1)
-; RV64IM-NEXT:    lhu a3, 24(a1)
-; RV64IM-NEXT:    lui a4, %hi(.LCPI2_0)
-; RV64IM-NEXT:    ld a4, %lo(.LCPI2_0)(a4)
-; RV64IM-NEXT:    lhu a5, 0(a1)
-; RV64IM-NEXT:    lhu a1, 8(a1)
+; RV64IM-NEXT:    lhu a2, 0(a1)
+; RV64IM-NEXT:    lhu a3, 8(a1)
+; RV64IM-NEXT:    lhu a4, 16(a1)
+; RV64IM-NEXT:    lhu a1, 24(a1)
+; RV64IM-NEXT:    lui a5, %hi(.LCPI2_0)
 ; RV64IM-NEXT:    li a6, 95
-; RV64IM-NEXT:    mulhu a7, a3, a4
-; RV64IM-NEXT:    mulhu t0, a2, a4
-; RV64IM-NEXT:    mulhu t1, a1, a4
-; RV64IM-NEXT:    mulhu a4, a5, a4
+; RV64IM-NEXT:    ld a5, %lo(.LCPI2_0)(a5)
+; RV64IM-NEXT:    mulhu a7, a1, a5
+; RV64IM-NEXT:    mulhu t0, a4, a5
+; RV64IM-NEXT:    mulhu t1, a3, a5
+; RV64IM-NEXT:    mulhu a5, a2, a5
 ; RV64IM-NEXT:    mul t2, a7, a6
 ; RV64IM-NEXT:    mul t3, t0, a6
 ; RV64IM-NEXT:    mul t4, t1, a6
-; RV64IM-NEXT:    mul a6, a4, a6
-; RV64IM-NEXT:    add a4, a5, a4
-; RV64IM-NEXT:    add a1, a1, t1
-; RV64IM-NEXT:    add a2, a2, t0
-; RV64IM-NEXT:    add a3, a3, a7
-; RV64IM-NEXT:    subw a4, a4, a6
-; RV64IM-NEXT:    subw a1, a1, t4
-; RV64IM-NEXT:    subw a2, a2, t3
-; RV64IM-NEXT:    subw a3, a3, t2
-; RV64IM-NEXT:    sh a4, 0(a0)
-; RV64IM-NEXT:    sh a1, 2(a0)
-; RV64IM-NEXT:    sh a2, 4(a0)
-; RV64IM-NEXT:    sh a3, 6(a0)
+; RV64IM-NEXT:    mul a6, a5, a6
+; RV64IM-NEXT:    add a2, a2, a5
+; RV64IM-NEXT:    add a3, a3, t1
+; RV64IM-NEXT:    add a4, a4, t0
+; RV64IM-NEXT:    add a1, a1, a7
+; RV64IM-NEXT:    subw a2, a2, a6
+; RV64IM-NEXT:    subw a3, a3, t4
+; RV64IM-NEXT:    subw a4, a4, t3
+; RV64IM-NEXT:    subw a1, a1, t2
+; RV64IM-NEXT:    sh a2, 0(a0)
+; RV64IM-NEXT:    sh a3, 2(a0)
+; RV64IM-NEXT:    sh a4, 4(a0)
+; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
   %2 = udiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
@@ -533,13 +529,12 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    lhu s1, 0(a1)
 ; RV32I-NEXT:    lhu s2, 4(a1)
 ; RV32I-NEXT:    lhu s3, 8(a1)
-; RV32I-NEXT:    lhu a2, 12(a1)
-; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lhu a0, 12(a1)
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __umodsi3
 ; RV32I-NEXT:    andi a1, s1, 63
 ; RV32I-NEXT:    andi a2, s2, 31
@@ -585,13 +580,12 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    lhu s1, 0(a1)
 ; RV64I-NEXT:    lhu s2, 8(a1)
 ; RV64I-NEXT:    lhu s3, 16(a1)
-; RV64I-NEXT:    lhu a2, 24(a1)
-; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    lhu a0, 24(a1)
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3
 ; RV64I-NEXT:    andi a1, s1, 63
 ; RV64I-NEXT:    andi a2, s2, 31
@@ -642,26 +636,25 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lhu a2, 4(a1)
-; RV32I-NEXT:    lhu s0, 8(a1)
-; RV32I-NEXT:    lhu s1, 12(a1)
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lhu a0, 4(a1)
+; RV32I-NEXT:    lhu s1, 8(a1)
+; RV32I-NEXT:    lhu s2, 12(a1)
 ; RV32I-NEXT:    li a1, 654
-; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __umodsi3
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    li a1, 23
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __umodsi3
-; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    lui a0, 1
 ; RV32I-NEXT:    addi a1, a0, 1327
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __umodsi3
-; RV32I-NEXT:    sh zero, 0(s2)
-; RV32I-NEXT:    sh s3, 2(s2)
-; RV32I-NEXT:    sh s0, 4(s2)
-; RV32I-NEXT:    sh a0, 6(s2)
+; RV32I-NEXT:    sh zero, 0(s0)
+; RV32I-NEXT:    sh s3, 2(s0)
+; RV32I-NEXT:    sh s1, 4(s0)
+; RV32I-NEXT:    sh a0, 6(s0)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -708,26 +701,25 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lhu a2, 8(a1)
-; RV64I-NEXT:    lhu s0, 16(a1)
-; RV64I-NEXT:    lhu s1, 24(a1)
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    lhu a0, 8(a1)
+; RV64I-NEXT:    lhu s1, 16(a1)
+; RV64I-NEXT:    lhu s2, 24(a1)
 ; RV64I-NEXT:    li a1, 654
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 23
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __umoddi3
-; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    lui a0, 1
 ; RV64I-NEXT:    addiw a1, a0, 1327
-; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call __umoddi3
-; RV64I-NEXT:    sh zero, 0(s2)
-; RV64I-NEXT:    sh s3, 2(s2)
-; RV64I-NEXT:    sh s0, 4(s2)
-; RV64I-NEXT:    sh a0, 6(s2)
+; RV64I-NEXT:    sh zero, 0(s0)
+; RV64I-NEXT:    sh s3, 2(s0)
+; RV64I-NEXT:    sh s1, 4(s0)
+; RV64I-NEXT:    sh a0, 6(s0)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -743,17 +735,17 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    lhu a1, 24(a1)
 ; RV64IM-NEXT:    lui a4, %hi(.LCPI4_0)
 ; RV64IM-NEXT:    li a5, 654
-; RV64IM-NEXT:    ld a4, %lo(.LCPI4_0)(a4)
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI4_1)
 ; RV64IM-NEXT:    li a7, 23
-; RV64IM-NEXT:    ld a6, %lo(.LCPI4_1)(a6)
+; RV64IM-NEXT:    ld a4, %lo(.LCPI4_0)(a4)
 ; RV64IM-NEXT:    mulhu a4, a2, a4
 ; RV64IM-NEXT:    mul a4, a4, a5
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI4_2)
-; RV64IM-NEXT:    ld a5, %lo(.LCPI4_2)(a5)
+; RV64IM-NEXT:    ld a6, %lo(.LCPI4_1)(a6)
 ; RV64IM-NEXT:    mulhu a6, a3, a6
 ; RV64IM-NEXT:    mul a6, a6, a7
 ; RV64IM-NEXT:    lui a7, 1
+; RV64IM-NEXT:    ld a5, %lo(.LCPI4_2)(a5)
 ; RV64IM-NEXT:    addi a7, a7, 1327
 ; RV64IM-NEXT:    mulhu a5, a1, a5
 ; RV64IM-NEXT:    mul a5, a5, a7
@@ -793,18 +785,17 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    lw s1, 16(a1)
 ; RV32I-NEXT:    lw s2, 20(a1)
 ; RV32I-NEXT:    lw s3, 24(a1)
 ; RV32I-NEXT:    lw s4, 28(a1)
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw a0, 0(a1)
+; RV32I-NEXT:    lw a3, 4(a1)
 ; RV32I-NEXT:    lw s5, 8(a1)
 ; RV32I-NEXT:    lw s6, 12(a1)
-; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a2, 1
-; RV32I-NEXT:    mv a0, a3
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3
 ; RV32I-NEXT:    mv s7, a0
@@ -863,18 +854,17 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
+; RV32IM-NEXT:    mv s0, a0
 ; RV32IM-NEXT:    lw s1, 16(a1)
 ; RV32IM-NEXT:    lw s2, 20(a1)
 ; RV32IM-NEXT:    lw s3, 24(a1)
 ; RV32IM-NEXT:    lw s4, 28(a1)
-; RV32IM-NEXT:    lw a3, 0(a1)
-; RV32IM-NEXT:    lw a4, 4(a1)
+; RV32IM-NEXT:    lw a0, 0(a1)
+; RV32IM-NEXT:    lw a3, 4(a1)
 ; RV32IM-NEXT:    lw s5, 8(a1)
 ; RV32IM-NEXT:    lw s6, 12(a1)
-; RV32IM-NEXT:    mv s0, a0
 ; RV32IM-NEXT:    li a2, 1
-; RV32IM-NEXT:    mv a0, a3
-; RV32IM-NEXT:    mv a1, a4
+; RV32IM-NEXT:    mv a1, a3
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3
 ; RV32IM-NEXT:    mv s7, a0
@@ -928,26 +918,25 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    ld a2, 8(a1)
-; RV64I-NEXT:    ld s0, 16(a1)
-; RV64I-NEXT:    ld s1, 24(a1)
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    ld a0, 8(a1)
+; RV64I-NEXT:    ld s1, 16(a1)
+; RV64I-NEXT:    ld s2, 24(a1)
 ; RV64I-NEXT:    li a1, 654
-; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 23
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __umoddi3
-; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    lui a0, 1
 ; RV64I-NEXT:    addiw a1, a0, 1327
-; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call __umoddi3
-; RV64I-NEXT:    sd zero, 0(s2)
-; RV64I-NEXT:    sd s3, 8(s2)
-; RV64I-NEXT:    sd s0, 16(s2)
-; RV64I-NEXT:    sd a0, 24(s2)
+; RV64I-NEXT:    sd zero, 0(s0)
+; RV64I-NEXT:    sd s3, 8(s0)
+; RV64I-NEXT:    sd s1, 16(s0)
+; RV64I-NEXT:    sd a0, 24(s0)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -961,31 +950,31 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV64IM-NEXT:    ld a2, 8(a1)
 ; RV64IM-NEXT:    ld a3, 16(a1)
 ; RV64IM-NEXT:    ld a1, 24(a1)
-; RV64IM-NEXT:    lui a4, %hi(.LCPI6_1)
-; RV64IM-NEXT:    ld a4, %lo(.LCPI6_1)(a4)
-; RV64IM-NEXT:    lui a5, %hi(.LCPI6_0)
+; RV64IM-NEXT:    lui a4, %hi(.LCPI6_0)
+; RV64IM-NEXT:    lui a5, %hi(.LCPI6_1)
 ; RV64IM-NEXT:    li a6, 654
+; RV64IM-NEXT:    ld a5, %lo(.LCPI6_1)(a5)
 ; RV64IM-NEXT:    srli a7, a2, 1
-; RV64IM-NEXT:    mulhu a4, a7, a4
+; RV64IM-NEXT:    mulhu a5, a7, a5
 ; RV64IM-NEXT:    lui a7, %hi(.LCPI6_2)
-; RV64IM-NEXT:    ld a5, %lo(.LCPI6_0)(a5)
-; RV64IM-NEXT:    ld a7, %lo(.LCPI6_2)(a7)
-; RV64IM-NEXT:    srli a4, a4, 7
-; RV64IM-NEXT:    mul a4, a4, a6
+; RV64IM-NEXT:    srli a5, a5, 7
+; RV64IM-NEXT:    mul a5, a5, a6
 ; RV64IM-NEXT:    lui a6, 1
+; RV64IM-NEXT:    ld a4, %lo(.LCPI6_0)(a4)
+; RV64IM-NEXT:    ld a7, %lo(.LCPI6_2)(a7)
 ; RV64IM-NEXT:    addiw a6, a6, 1327
-; RV64IM-NEXT:    mulhu a5, a3, a5
+; RV64IM-NEXT:    mulhu a4, a3, a4
 ; RV64IM-NEXT:    mulhu a7, a1, a7
 ; RV64IM-NEXT:    srli a7, a7, 12
 ; RV64IM-NEXT:    mul a6, a7, a6
-; RV64IM-NEXT:    sub a7, a3, a5
+; RV64IM-NEXT:    sub a7, a3, a4
 ; RV64IM-NEXT:    srli a7, a7, 1
-; RV64IM-NEXT:    add a5, a7, a5
-; RV64IM-NEXT:    sub a2, a2, a4
+; RV64IM-NEXT:    add a4, a7, a4
+; RV64IM-NEXT:    sub a2, a2, a5
 ; RV64IM-NEXT:    sub a1, a1, a6
-; RV64IM-NEXT:    li a4, 23
-; RV64IM-NEXT:    srli a5, a5, 4
-; RV64IM-NEXT:    mul a4, a5, a4
+; RV64IM-NEXT:    li a5, 23
+; RV64IM-NEXT:    srli a4, a4, 4
+; RV64IM-NEXT:    mul a4, a4, a5
 ; RV64IM-NEXT:    sub a3, a3, a4
 ; RV64IM-NEXT:    sd zero, 0(a0)
 ; RV64IM-NEXT:    sd a2, 8(a0)
diff --git a/llvm/test/CodeGen/RISCV/vararg.ll b/llvm/test/CodeGen/RISCV/vararg.ll
index 895d84b38be32..2d6434ebdb434 100644
--- a/llvm/test/CodeGen/RISCV/vararg.ll
+++ b/llvm/test/CodeGen/RISCV/vararg.ll
@@ -162,16 +162,16 @@ define i32 @va1(ptr %fmt, ...) {
 ; LP64-LP64F-LP64D-FPELIM:       # %bb.0:
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    addi sp, sp, -80
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    .cfi_def_cfa_offset 80
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a1, 24(sp)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a0, sp, 28
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a0, 8(sp)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    lw a0, 24(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a5, 56(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a6, 64(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a7, 72(sp)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a1, 24(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a2, 32(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a3, 40(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a4, 48(sp)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a0, sp, 28
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a0, 8(sp)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    lw a0, 24(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    addi sp, sp, 80
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    .cfi_def_cfa_offset 0
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    ret
@@ -186,16 +186,16 @@ define i32 @va1(ptr %fmt, ...) {
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    .cfi_offset s0, -80
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    addi s0, sp, 32
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    .cfi_def_cfa s0, 64
-; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a1, 8(s0)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a0, s0, 12
-; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a0, -24(s0)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    lw a0, 8(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a5, 40(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a6, 48(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a7, 56(s0)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a1, 8(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a2, 16(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a3, 24(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a4, 32(s0)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a0, s0, 12
+; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a0, -24(s0)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    lw a0, 8(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    .cfi_def_cfa sp, 96
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -209,14 +209,14 @@ define i32 @va1(ptr %fmt, ...) {
 ; LP64E-FPELIM:       # %bb.0:
 ; LP64E-FPELIM-NEXT:    addi sp, sp, -56
 ; LP64E-FPELIM-NEXT:    .cfi_def_cfa_offset 56
-; LP64E-FPELIM-NEXT:    addi a0, sp, 20
-; LP64E-FPELIM-NEXT:    sd a0, 0(sp)
-; LP64E-FPELIM-NEXT:    sd a1, 16(sp)
-; LP64E-FPELIM-NEXT:    lw a0, 16(sp)
 ; LP64E-FPELIM-NEXT:    sd a5, 48(sp)
+; LP64E-FPELIM-NEXT:    sd a1, 16(sp)
 ; LP64E-FPELIM-NEXT:    sd a2, 24(sp)
 ; LP64E-FPELIM-NEXT:    sd a3, 32(sp)
 ; LP64E-FPELIM-NEXT:    sd a4, 40(sp)
+; LP64E-FPELIM-NEXT:    addi a0, sp, 20
+; LP64E-FPELIM-NEXT:    sd a0, 0(sp)
+; LP64E-FPELIM-NEXT:    lw a0, 16(sp)
 ; LP64E-FPELIM-NEXT:    addi sp, sp, 56
 ; LP64E-FPELIM-NEXT:    .cfi_def_cfa_offset 0
 ; LP64E-FPELIM-NEXT:    ret
@@ -231,14 +231,14 @@ define i32 @va1(ptr %fmt, ...) {
 ; LP64E-WITHFP-NEXT:    .cfi_offset s0, -64
 ; LP64E-WITHFP-NEXT:    addi s0, sp, 24
 ; LP64E-WITHFP-NEXT:    .cfi_def_cfa s0, 48
-; LP64E-WITHFP-NEXT:    addi a0, s0, 12
-; LP64E-WITHFP-NEXT:    sd a0, -24(s0)
-; LP64E-WITHFP-NEXT:    sd a1, 8(s0)
-; LP64E-WITHFP-NEXT:    lw a0, 8(s0)
 ; LP64E-WITHFP-NEXT:    sd a5, 40(s0)
+; LP64E-WITHFP-NEXT:    sd a1, 8(s0)
 ; LP64E-WITHFP-NEXT:    sd a2, 16(s0)
 ; LP64E-WITHFP-NEXT:    sd a3, 24(s0)
 ; LP64E-WITHFP-NEXT:    sd a4, 32(s0)
+; LP64E-WITHFP-NEXT:    addi a0, s0, 12
+; LP64E-WITHFP-NEXT:    sd a0, -24(s0)
+; LP64E-WITHFP-NEXT:    lw a0, 8(s0)
 ; LP64E-WITHFP-NEXT:    .cfi_def_cfa sp, 72
 ; LP64E-WITHFP-NEXT:    ld ra, 16(sp) # 8-byte Folded Reload
 ; LP64E-WITHFP-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
@@ -1348,10 +1348,10 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a0, 4(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    andi a3, a3, -8
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a4, 4(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    lw a0, 4(a3)
-; ILP32-ILP32F-FPELIM-NEXT:    lw a3, 0(a3)
-; ILP32-ILP32F-FPELIM-NEXT:    add a2, a2, a0
-; ILP32-ILP32F-FPELIM-NEXT:    add a0, a1, a3
+; ILP32-ILP32F-FPELIM-NEXT:    lw a0, 0(a3)
+; ILP32-ILP32F-FPELIM-NEXT:    lw a3, 4(a3)
+; ILP32-ILP32F-FPELIM-NEXT:    add a2, a2, a3
+; ILP32-ILP32F-FPELIM-NEXT:    add a0, a1, a0
 ; ILP32-ILP32F-FPELIM-NEXT:    sltu a1, a0, a1
 ; ILP32-ILP32F-FPELIM-NEXT:    add a1, a2, a1
 ; ILP32-ILP32F-FPELIM-NEXT:    addi sp, sp, 32
@@ -1374,10 +1374,10 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a0, -12(s0)
 ; ILP32-ILP32F-WITHFP-NEXT:    andi a3, a3, -8
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a4, -12(s0)
-; ILP32-ILP32F-WITHFP-NEXT:    lw a0, 4(a3)
-; ILP32-ILP32F-WITHFP-NEXT:    lw a3, 0(a3)
-; ILP32-ILP32F-WITHFP-NEXT:    add a2, a2, a0
-; ILP32-ILP32F-WITHFP-NEXT:    add a0, a1, a3
+; ILP32-ILP32F-WITHFP-NEXT:    lw a0, 0(a3)
+; ILP32-ILP32F-WITHFP-NEXT:    lw a3, 4(a3)
+; ILP32-ILP32F-WITHFP-NEXT:    add a2, a2, a3
+; ILP32-ILP32F-WITHFP-NEXT:    add a0, a1, a0
 ; ILP32-ILP32F-WITHFP-NEXT:    sltu a1, a0, a1
 ; ILP32-ILP32F-WITHFP-NEXT:    add a1, a2, a1
 ; ILP32-ILP32F-WITHFP-NEXT:    lw ra, 20(sp) # 4-byte Folded Reload
@@ -1399,10 +1399,10 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a0, 4(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    andi a3, a3, -8
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a4, 4(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a0, 4(a3)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a3, 0(a3)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a2, a2, a0
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a0, a1, a3
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a0, 0(a3)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a3, 4(a3)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a2, a2, a3
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a0, a1, a0
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sltu a1, a0, a1
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a1, a2, a1
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi sp, sp, 32
@@ -1420,10 +1420,10 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; ILP32E-FPELIM-NEXT:    sw a0, 0(sp)
 ; ILP32E-FPELIM-NEXT:    andi a3, a3, -8
 ; ILP32E-FPELIM-NEXT:    sw a4, 0(sp)
-; ILP32E-FPELIM-NEXT:    lw a0, 4(a3)
-; ILP32E-FPELIM-NEXT:    lw a3, 0(a3)
-; ILP32E-FPELIM-NEXT:    add a2, a2, a0
-; ILP32E-FPELIM-NEXT:    add a0, a1, a3
+; ILP32E-FPELIM-NEXT:    lw a0, 0(a3)
+; ILP32E-FPELIM-NEXT:    lw a3, 4(a3)
+; ILP32E-FPELIM-NEXT:    add a2, a2, a3
+; ILP32E-FPELIM-NEXT:    add a0, a1, a0
 ; ILP32E-FPELIM-NEXT:    sltu a1, a0, a1
 ; ILP32E-FPELIM-NEXT:    add a1, a2, a1
 ; ILP32E-FPELIM-NEXT:    addi sp, sp, 20
@@ -1444,10 +1444,10 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; ILP32E-WITHFP-NEXT:    sw a0, -12(s0)
 ; ILP32E-WITHFP-NEXT:    andi a3, a3, -8
 ; ILP32E-WITHFP-NEXT:    sw a4, -12(s0)
-; ILP32E-WITHFP-NEXT:    lw a0, 4(a3)
-; ILP32E-WITHFP-NEXT:    lw a3, 0(a3)
-; ILP32E-WITHFP-NEXT:    add a2, a2, a0
-; ILP32E-WITHFP-NEXT:    add a0, a1, a3
+; ILP32E-WITHFP-NEXT:    lw a0, 0(a3)
+; ILP32E-WITHFP-NEXT:    lw a3, 4(a3)
+; ILP32E-WITHFP-NEXT:    add a2, a2, a3
+; ILP32E-WITHFP-NEXT:    add a0, a1, a0
 ; ILP32E-WITHFP-NEXT:    sltu a1, a0, a1
 ; ILP32E-WITHFP-NEXT:    add a1, a2, a1
 ; ILP32E-WITHFP-NEXT:    lw ra, 8(sp) # 4-byte Folded Reload
@@ -1464,9 +1464,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a3, 24(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a4, 32(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a5, 40(sp)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a3, sp, 31
+; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a0, sp, 31
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a0, 8(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    add a0, a1, a2
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a3, 8(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    addi sp, sp, 64
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    ret
 ;
@@ -1482,9 +1482,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a3, 8(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a4, 16(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a5, 24(s0)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a3, s0, 15
+; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a0, s0, 15
+; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a0, -24(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    add a0, a1, a2
-; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a3, -24(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    addi sp, sp, 80
@@ -1497,9 +1497,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; LP64E-FPELIM-NEXT:    sd a3, 16(sp)
 ; LP64E-FPELIM-NEXT:    sd a4, 24(sp)
 ; LP64E-FPELIM-NEXT:    sd a5, 32(sp)
-; LP64E-FPELIM-NEXT:    addi a3, sp, 23
+; LP64E-FPELIM-NEXT:    addi a0, sp, 23
+; LP64E-FPELIM-NEXT:    sd a0, 0(sp)
 ; LP64E-FPELIM-NEXT:    add a0, a1, a2
-; LP64E-FPELIM-NEXT:    sd a3, 0(sp)
 ; LP64E-FPELIM-NEXT:    addi sp, sp, 40
 ; LP64E-FPELIM-NEXT:    ret
 ;
@@ -1513,9 +1513,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; LP64E-WITHFP-NEXT:    sd a3, 8(s0)
 ; LP64E-WITHFP-NEXT:    sd a4, 16(s0)
 ; LP64E-WITHFP-NEXT:    sd a5, 24(s0)
-; LP64E-WITHFP-NEXT:    addi a3, s0, 15
+; LP64E-WITHFP-NEXT:    addi a0, s0, 15
+; LP64E-WITHFP-NEXT:    sd a0, -24(s0)
 ; LP64E-WITHFP-NEXT:    add a0, a1, a2
-; LP64E-WITHFP-NEXT:    sd a3, -24(s0)
 ; LP64E-WITHFP-NEXT:    ld ra, 16(sp) # 8-byte Folded Reload
 ; LP64E-WITHFP-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; LP64E-WITHFP-NEXT:    addi sp, sp, 56
@@ -1603,10 +1603,10 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a3, 20(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    fld fa5, 0(a0)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    fsd fa5, 8(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a0, 12(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a3, 8(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a2, a2, a0
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a0, a1, a3
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a0, 8(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a3, 12(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a2, a2, a3
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a0, a1, a0
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sltu a1, a0, a1
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a1, a2, a1
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi sp, sp, 48
@@ -1668,9 +1668,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a3, 24(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a4, 32(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a5, 40(sp)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a3, sp, 24
+; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a0, sp, 24
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a0, 8(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    add a0, a1, a2
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a3, 8(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    addi sp, sp, 64
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    ret
 ;
@@ -1686,9 +1686,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a3, 8(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a4, 16(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a5, 24(s0)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a3, s0, 8
+; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a0, s0, 8
+; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a0, -24(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    add a0, a1, a2
-; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a3, -24(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    addi sp, sp, 80
@@ -1701,9 +1701,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
 ; LP64E-FPELIM-NEXT:    sd a3, 16(sp)
 ; LP64E-FPELIM-NEXT:    sd a4, 24(sp)
 ; LP64E-FPELIM-NEXT:    sd a5, 32(sp)
-; LP64E-FPELIM-NEXT:    addi a3, sp, 16
+; LP64E-FPELIM-NEXT:    addi a0, sp, 16
+; LP64E-FPELIM-NEXT:    sd a0, 0(sp)
 ; LP64E-FPELIM-NEXT:    add a0, a1, a2
-; LP64E-FPELIM-NEXT:    sd a3, 0(sp)
 ; LP64E-FPELIM-NEXT:    addi sp, sp, 40
 ; LP64E-FPELIM-NEXT:    ret
 ;
@@ -1717,9 +1717,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
 ; LP64E-WITHFP-NEXT:    sd a3, 8(s0)
 ; LP64E-WITHFP-NEXT:    sd a4, 16(s0)
 ; LP64E-WITHFP-NEXT:    sd a5, 24(s0)
-; LP64E-WITHFP-NEXT:    addi a3, s0, 8
+; LP64E-WITHFP-NEXT:    addi a0, s0, 8
+; LP64E-WITHFP-NEXT:    sd a0, -24(s0)
 ; LP64E-WITHFP-NEXT:    add a0, a1, a2
-; LP64E-WITHFP-NEXT:    sd a3, -24(s0)
 ; LP64E-WITHFP-NEXT:    ld ra, 16(sp) # 8-byte Folded Reload
 ; LP64E-WITHFP-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; LP64E-WITHFP-NEXT:    addi sp, sp, 56
@@ -2275,40 +2275,40 @@ define void @va5_aligned_stack_caller() nounwind {
 ; ILP32-ILP32F-FPELIM:       # %bb.0:
 ; ILP32-ILP32F-FPELIM-NEXT:    addi sp, sp, -64
 ; ILP32-ILP32F-FPELIM-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; ILP32-ILP32F-FPELIM-NEXT:    li a4, 17
-; ILP32-ILP32F-FPELIM-NEXT:    li a5, 16
-; ILP32-ILP32F-FPELIM-NEXT:    li a6, 15
-; ILP32-ILP32F-FPELIM-NEXT:    lui a7, 262236
-; ILP32-ILP32F-FPELIM-NEXT:    lui t0, 377487
-; ILP32-ILP32F-FPELIM-NEXT:    li t1, 14
-; ILP32-ILP32F-FPELIM-NEXT:    lui t2, 262153
-; ILP32-ILP32F-FPELIM-NEXT:    lui t3, 545260
-; ILP32-ILP32F-FPELIM-NEXT:    lui t4, 964690
-; ILP32-ILP32F-FPELIM-NEXT:    lui t5, 335544
-; ILP32-ILP32F-FPELIM-NEXT:    lui t6, 688509
+; ILP32-ILP32F-FPELIM-NEXT:    li a3, 17
+; ILP32-ILP32F-FPELIM-NEXT:    li a4, 16
+; ILP32-ILP32F-FPELIM-NEXT:    li a5, 15
+; ILP32-ILP32F-FPELIM-NEXT:    lui a6, 262236
+; ILP32-ILP32F-FPELIM-NEXT:    lui a7, 377487
+; ILP32-ILP32F-FPELIM-NEXT:    li t0, 14
+; ILP32-ILP32F-FPELIM-NEXT:    lui t1, 262153
+; ILP32-ILP32F-FPELIM-NEXT:    lui t2, 545260
+; ILP32-ILP32F-FPELIM-NEXT:    lui t3, 964690
+; ILP32-ILP32F-FPELIM-NEXT:    lui t4, 335544
+; ILP32-ILP32F-FPELIM-NEXT:    lui t5, 688509
 ; ILP32-ILP32F-FPELIM-NEXT:    li a0, 1
 ; ILP32-ILP32F-FPELIM-NEXT:    li a1, 11
 ; ILP32-ILP32F-FPELIM-NEXT:    addi a2, sp, 32
+; ILP32-ILP32F-FPELIM-NEXT:    sw a4, 20(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    sw a3, 24(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    li a3, 12
-; ILP32-ILP32F-FPELIM-NEXT:    sw a5, 20(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    sw a4, 24(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    addi a4, a6, 655
+; ILP32-ILP32F-FPELIM-NEXT:    addi a6, a7, 1475
+; ILP32-ILP32F-FPELIM-NEXT:    sw t0, 0(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    sw a6, 8(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    sw a4, 12(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    sw a5, 16(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    li a4, 13
-; ILP32-ILP32F-FPELIM-NEXT:    addi a5, a7, 655
-; ILP32-ILP32F-FPELIM-NEXT:    addi a7, t0, 1475
-; ILP32-ILP32F-FPELIM-NEXT:    sw t1, 0(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    sw a7, 8(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    sw a5, 12(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    sw a6, 16(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    li a7, 4
-; ILP32-ILP32F-FPELIM-NEXT:    addi a5, t2, 491
-; ILP32-ILP32F-FPELIM-NEXT:    addi t0, t3, -1967
-; ILP32-ILP32F-FPELIM-NEXT:    addi t1, t4, -328
-; ILP32-ILP32F-FPELIM-NEXT:    addi t2, t5, 1311
-; ILP32-ILP32F-FPELIM-NEXT:    addi a6, t6, -2048
-; ILP32-ILP32F-FPELIM-NEXT:    sw t2, 32(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    sw t1, 36(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    sw t0, 40(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    addi a5, t1, 491
+; ILP32-ILP32F-FPELIM-NEXT:    addi a7, t2, -1967
+; ILP32-ILP32F-FPELIM-NEXT:    addi t0, t3, -328
+; ILP32-ILP32F-FPELIM-NEXT:    addi t1, t4, 1311
+; ILP32-ILP32F-FPELIM-NEXT:    addi a6, t5, -2048
+; ILP32-ILP32F-FPELIM-NEXT:    sw t1, 32(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    sw t0, 36(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    sw a7, 40(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a5, 44(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    li a7, 4
 ; ILP32-ILP32F-FPELIM-NEXT:    call va5_aligned_stack_callee
 ; ILP32-ILP32F-FPELIM-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
 ; ILP32-ILP32F-FPELIM-NEXT:    addi sp, sp, 64
@@ -2320,40 +2320,40 @@ define void @va5_aligned_stack_caller() nounwind {
 ; ILP32-ILP32F-WITHFP-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
 ; ILP32-ILP32F-WITHFP-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
 ; ILP32-ILP32F-WITHFP-NEXT:    addi s0, sp, 64
-; ILP32-ILP32F-WITHFP-NEXT:    li a4, 17
-; ILP32-ILP32F-WITHFP-NEXT:    li a5, 16
-; ILP32-ILP32F-WITHFP-NEXT:    li a6, 15
-; ILP32-ILP32F-WITHFP-NEXT:    lui a7, 262236
-; ILP32-ILP32F-WITHFP-NEXT:    lui t0, 377487
-; ILP32-ILP32F-WITHFP-NEXT:    li t1, 14
-; ILP32-ILP32F-WITHFP-NEXT:    lui t2, 262153
-; ILP32-ILP32F-WITHFP-NEXT:    lui t3, 545260
-; ILP32-ILP32F-WITHFP-NEXT:    lui t4, 964690
-; ILP32-ILP32F-WITHFP-NEXT:    lui t5, 335544
-; ILP32-ILP32F-WITHFP-NEXT:    lui t6, 688509
+; ILP32-ILP32F-WITHFP-NEXT:    li a3, 17
+; ILP32-ILP32F-WITHFP-NEXT:    li a4, 16
+; ILP32-ILP32F-WITHFP-NEXT:    li a5, 15
+; ILP32-ILP32F-WITHFP-NEXT:    lui a6, 262236
+; ILP32-ILP32F-WITHFP-NEXT:    lui a7, 377487
+; ILP32-ILP32F-WITHFP-NEXT:    li t0, 14
+; ILP32-ILP32F-WITHFP-NEXT:    lui t1, 262153
+; ILP32-ILP32F-WITHFP-NEXT:    lui t2, 545260
+; ILP32-ILP32F-WITHFP-NEXT:    lui t3, 964690
+; ILP32-ILP32F-WITHFP-NEXT:    lui t4, 335544
+; ILP32-ILP32F-WITHFP-NEXT:    lui t5, 688509
 ; ILP32-ILP32F-WITHFP-NEXT:    li a0, 1
 ; ILP32-ILP32F-WITHFP-NEXT:    li a1, 11
 ; ILP32-ILP32F-WITHFP-NEXT:    addi a2, s0, -32
+; ILP32-ILP32F-WITHFP-NEXT:    sw a4, 20(sp)
+; ILP32-ILP32F-WITHFP-NEXT:    sw a3, 24(sp)
 ; ILP32-ILP32F-WITHFP-NEXT:    li a3, 12
-; ILP32-ILP32F-WITHFP-NEXT:    sw a5, 20(sp)
-; ILP32-ILP32F-WITHFP-NEXT:    sw a4, 24(sp)
+; ILP32-ILP32F-WITHFP-NEXT:    addi a4, a6, 655
+; ILP32-ILP32F-WITHFP-NEXT:    addi a6, a7, 1475
+; ILP32-ILP32F-WITHFP-NEXT:    sw t0, 0(sp)
+; ILP32-ILP32F-WITHFP-NEXT:    sw a6, 8(sp)
+; ILP32-ILP32F-WITHFP-NEXT:    sw a4, 12(sp)
+; ILP32-ILP32F-WITHFP-NEXT:    sw a5, 16(sp)
 ; ILP32-ILP32F-WITHFP-NEXT:    li a4, 13
-; ILP32-ILP32F-WITHFP-NEXT:    addi a5, a7, 655
-; ILP32-ILP32F-WITHFP-NEXT:    addi a7, t0, 1475
-; ILP32-ILP32F-WITHFP-NEXT:    sw t1, 0(sp)
-; ILP32-ILP32F-WITHFP-NEXT:    sw a7, 8(sp)
-; ILP32-ILP32F-WITHFP-NEXT:    sw a5, 12(sp)
-; ILP32-ILP32F-WITHFP-NEXT:    sw a6, 16(sp)
-; ILP32-ILP32F-WITHFP-NEXT:    li a7, 4
-; ILP32-ILP32F-WITHFP-NEXT:    addi a5, t2, 491
-; ILP32-ILP32F-WITHFP-NEXT:    addi t0, t3, -1967
-; ILP32-ILP32F-WITHFP-NEXT:    addi t1, t4, -328
-; ILP32-ILP32F-WITHFP-NEXT:    addi t2, t5, 1311
-; ILP32-ILP32F-WITHFP-NEXT:    addi a6, t6, -2048
-; ILP32-ILP32F-WITHFP-NEXT:    sw t2, -32(s0)
-; ILP32-ILP32F-WITHFP-NEXT:    sw t1, -28(s0)
-; ILP32-ILP32F-WITHFP-NEXT:    sw t0, -24(s0)
+; ILP32-ILP32F-WITHFP-NEXT:    addi a5, t1, 491
+; ILP32-ILP32F-WITHFP-NEXT:    addi a7, t2, -1967
+; ILP32-ILP32F-WITHFP-NEXT:    addi t0, t3, -328
+; ILP32-ILP32F-WITHFP-NEXT:    addi t1, t4, 1311
+; ILP32-ILP32F-WITHFP-NEXT:    addi a6, t5, -2048
+; ILP32-ILP32F-WITHFP-NEXT:    sw t1, -32(s0)
+; ILP32-ILP32F-WITHFP-NEXT:    sw t0, -28(s0)
+; ILP32-ILP32F-WITHFP-NEXT:    sw a7, -24(s0)
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a5, -20(s0)
+; ILP32-ILP32F-WITHFP-NEXT:    li a7, 4
 ; ILP32-ILP32F-WITHFP-NEXT:    call va5_aligned_stack_callee
 ; ILP32-ILP32F-WITHFP-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
 ; ILP32-ILP32F-WITHFP-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
@@ -2364,40 +2364,40 @@ define void @va5_aligned_stack_caller() nounwind {
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM:       # %bb.0:
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi sp, sp, -64
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui a5, 262236
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui a6, 377487
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a4, 17
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a7, 16
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li t0, 15
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li t1, 14
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui t2, 262153
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui t3, 545260
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui t4, 964690
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui t5, 335544
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui t6, 688509
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui a4, 262236
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui a5, 377487
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a3, 17
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a6, 16
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a7, 15
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li t0, 14
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui t1, 262153
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui t2, 545260
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui t3, 964690
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui t4, 335544
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui t5, 688509
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a0, 1
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a1, 11
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a2, sp, 32
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a6, 20(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a3, 24(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a3, 12
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a7, 20(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a4, 24(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a4, a4, 655
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a5, a5, 1475
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw t0, 0(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a5, 8(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a4, 12(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a7, 16(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a4, 13
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a5, a5, 655
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a6, a6, 1475
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw t1, 0(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a6, 8(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a5, 12(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw t0, 16(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a7, 4
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a5, t2, 491
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi t0, t3, -1967
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi t1, t4, -328
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi t2, t5, 1311
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a6, t6, -2048
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw t2, 32(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw t1, 36(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw t0, 40(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a5, t1, 491
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a7, t2, -1967
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi t0, t3, -328
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi t1, t4, 1311
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a6, t5, -2048
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw t1, 32(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw t0, 36(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a7, 40(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a5, 44(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a7, 4
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    call va5_aligned_stack_callee
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi sp, sp, 64
@@ -2410,41 +2410,41 @@ define void @va5_aligned_stack_caller() nounwind {
 ; ILP32E-FPELIM-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
 ; ILP32E-FPELIM-NEXT:    addi s0, sp, 64
 ; ILP32E-FPELIM-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-NEXT:    li a3, 17
-; ILP32E-FPELIM-NEXT:    li a4, 16
-; ILP32E-FPELIM-NEXT:    li a5, 15
-; ILP32E-FPELIM-NEXT:    lui a6, 262236
-; ILP32E-FPELIM-NEXT:    lui a7, 377487
-; ILP32E-FPELIM-NEXT:    li t0, 14
-; ILP32E-FPELIM-NEXT:    li t1, 4
-; ILP32E-FPELIM-NEXT:    lui t2, 262153
-; ILP32E-FPELIM-NEXT:    lui t3, 545260
-; ILP32E-FPELIM-NEXT:    lui t4, 964690
-; ILP32E-FPELIM-NEXT:    lui t5, 335544
-; ILP32E-FPELIM-NEXT:    lui t6, 688509
+; ILP32E-FPELIM-NEXT:    li a2, 17
+; ILP32E-FPELIM-NEXT:    li a3, 16
+; ILP32E-FPELIM-NEXT:    li a4, 15
+; ILP32E-FPELIM-NEXT:    lui a5, 262236
+; ILP32E-FPELIM-NEXT:    lui a6, 377487
+; ILP32E-FPELIM-NEXT:    li a7, 14
+; ILP32E-FPELIM-NEXT:    li t0, 4
+; ILP32E-FPELIM-NEXT:    lui t1, 262153
+; ILP32E-FPELIM-NEXT:    lui t2, 545260
+; ILP32E-FPELIM-NEXT:    lui t3, 964690
+; ILP32E-FPELIM-NEXT:    lui t4, 335544
+; ILP32E-FPELIM-NEXT:    lui t5, 688509
 ; ILP32E-FPELIM-NEXT:    li a0, 1
 ; ILP32E-FPELIM-NEXT:    li a1, 11
+; ILP32E-FPELIM-NEXT:    sw a4, 16(sp)
+; ILP32E-FPELIM-NEXT:    sw a3, 20(sp)
+; ILP32E-FPELIM-NEXT:    sw a2, 24(sp)
 ; ILP32E-FPELIM-NEXT:    addi a2, sp, 32
-; ILP32E-FPELIM-NEXT:    sw a5, 16(sp)
-; ILP32E-FPELIM-NEXT:    sw a4, 20(sp)
-; ILP32E-FPELIM-NEXT:    sw a3, 24(sp)
+; ILP32E-FPELIM-NEXT:    addi a3, a5, 655
+; ILP32E-FPELIM-NEXT:    addi a4, a6, 1475
+; ILP32E-FPELIM-NEXT:    sw t0, 0(sp)
+; ILP32E-FPELIM-NEXT:    sw a7, 4(sp)
+; ILP32E-FPELIM-NEXT:    sw a4, 8(sp)
+; ILP32E-FPELIM-NEXT:    sw a3, 12(sp)
 ; ILP32E-FPELIM-NEXT:    li a3, 12
-; ILP32E-FPELIM-NEXT:    addi a4, a6, 655
-; ILP32E-FPELIM-NEXT:    addi a5, a7, 1475
-; ILP32E-FPELIM-NEXT:    sw t1, 0(sp)
-; ILP32E-FPELIM-NEXT:    sw t0, 4(sp)
-; ILP32E-FPELIM-NEXT:    sw a5, 8(sp)
-; ILP32E-FPELIM-NEXT:    sw a4, 12(sp)
+; ILP32E-FPELIM-NEXT:    addi a4, t1, 491
+; ILP32E-FPELIM-NEXT:    addi a6, t2, -1967
+; ILP32E-FPELIM-NEXT:    addi a7, t3, -328
+; ILP32E-FPELIM-NEXT:    addi t0, t4, 1311
+; ILP32E-FPELIM-NEXT:    addi a5, t5, -2048
+; ILP32E-FPELIM-NEXT:    sw t0, 32(sp)
+; ILP32E-FPELIM-NEXT:    sw a7, 36(sp)
+; ILP32E-FPELIM-NEXT:    sw a6, 40(sp)
+; ILP32E-FPELIM-NEXT:    sw a4, 44(sp)
 ; ILP32E-FPELIM-NEXT:    li a4, 13
-; ILP32E-FPELIM-NEXT:    addi a6, t2, 491
-; ILP32E-FPELIM-NEXT:    addi a7, t3, -1967
-; ILP32E-FPELIM-NEXT:    addi t0, t4, -328
-; ILP32E-FPELIM-NEXT:    addi t1, t5, 1311
-; ILP32E-FPELIM-NEXT:    addi a5, t6, -2048
-; ILP32E-FPELIM-NEXT:    sw t1, 32(sp)
-; ILP32E-FPELIM-NEXT:    sw t0, 36(sp)
-; ILP32E-FPELIM-NEXT:    sw a7, 40(sp)
-; ILP32E-FPELIM-NEXT:    sw a6, 44(sp)
 ; ILP32E-FPELIM-NEXT:    call va5_aligned_stack_callee
 ; ILP32E-FPELIM-NEXT:    addi sp, s0, -64
 ; ILP32E-FPELIM-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
@@ -2459,41 +2459,41 @@ define void @va5_aligned_stack_caller() nounwind {
 ; ILP32E-WITHFP-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 64
 ; ILP32E-WITHFP-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-NEXT:    li a3, 17
-; ILP32E-WITHFP-NEXT:    li a4, 16
-; ILP32E-WITHFP-NEXT:    li a5, 15
-; ILP32E-WITHFP-NEXT:    lui a6, 262236
-; ILP32E-WITHFP-NEXT:    lui a7, 377487
-; ILP32E-WITHFP-NEXT:    li t0, 14
-; ILP32E-WITHFP-NEXT:    li t1, 4
-; ILP32E-WITHFP-NEXT:    lui t2, 262153
-; ILP32E-WITHFP-NEXT:    lui t3, 545260
-; ILP32E-WITHFP-NEXT:    lui t4, 964690
-; ILP32E-WITHFP-NEXT:    lui t5, 335544
-; ILP32E-WITHFP-NEXT:    lui t6, 688509
+; ILP32E-WITHFP-NEXT:    li a2, 17
+; ILP32E-WITHFP-NEXT:    li a3, 16
+; ILP32E-WITHFP-NEXT:    li a4, 15
+; ILP32E-WITHFP-NEXT:    lui a5, 262236
+; ILP32E-WITHFP-NEXT:    lui a6, 377487
+; ILP32E-WITHFP-NEXT:    li a7, 14
+; ILP32E-WITHFP-NEXT:    li t0, 4
+; ILP32E-WITHFP-NEXT:    lui t1, 262153
+; ILP32E-WITHFP-NEXT:    lui t2, 545260
+; ILP32E-WITHFP-NEXT:    lui t3, 964690
+; ILP32E-WITHFP-NEXT:    lui t4, 335544
+; ILP32E-WITHFP-NEXT:    lui t5, 688509
 ; ILP32E-WITHFP-NEXT:    li a0, 1
 ; ILP32E-WITHFP-NEXT:    li a1, 11
+; ILP32E-WITHFP-NEXT:    sw a4, 16(sp)
+; ILP32E-WITHFP-NEXT:    sw a3, 20(sp)
+; ILP32E-WITHFP-NEXT:    sw a2, 24(sp)
 ; ILP32E-WITHFP-NEXT:    addi a2, sp, 32
-; ILP32E-WITHFP-NEXT:    sw a5, 16(sp)
-; ILP32E-WITHFP-NEXT:    sw a4, 20(sp)
-; ILP32E-WITHFP-NEXT:    sw a3, 24(sp)
+; ILP32E-WITHFP-NEXT:    addi a3, a5, 655
+; ILP32E-WITHFP-NEXT:    addi a4, a6, 1475
+; ILP32E-WITHFP-NEXT:    sw t0, 0(sp)
+; ILP32E-WITHFP-NEXT:    sw a7, 4(sp)
+; ILP32E-WITHFP-NEXT:    sw a4, 8(sp)
+; ILP32E-WITHFP-NEXT:    sw a3, 12(sp)
 ; ILP32E-WITHFP-NEXT:    li a3, 12
-; ILP32E-WITHFP-NEXT:    addi a4, a6, 655
-; ILP32E-WITHFP-NEXT:    addi a5, a7, 1475
-; ILP32E-WITHFP-NEXT:    sw t1, 0(sp)
-; ILP32E-WITHFP-NEXT:    sw t0, 4(sp)
-; ILP32E-WITHFP-NEXT:    sw a5, 8(sp)
-; ILP32E-WITHFP-NEXT:    sw a4, 12(sp)
+; ILP32E-WITHFP-NEXT:    addi a4, t1, 491
+; ILP32E-WITHFP-NEXT:    addi a6, t2, -1967
+; ILP32E-WITHFP-NEXT:    addi a7, t3, -328
+; ILP32E-WITHFP-NEXT:    addi t0, t4, 1311
+; ILP32E-WITHFP-NEXT:    addi a5, t5, -2048
+; ILP32E-WITHFP-NEXT:    sw t0, 32(sp)
+; ILP32E-WITHFP-NEXT:    sw a7, 36(sp)
+; ILP32E-WITHFP-NEXT:    sw a6, 40(sp)
+; ILP32E-WITHFP-NEXT:    sw a4, 44(sp)
 ; ILP32E-WITHFP-NEXT:    li a4, 13
-; ILP32E-WITHFP-NEXT:    addi a6, t2, 491
-; ILP32E-WITHFP-NEXT:    addi a7, t3, -1967
-; ILP32E-WITHFP-NEXT:    addi t0, t4, -328
-; ILP32E-WITHFP-NEXT:    addi t1, t5, 1311
-; ILP32E-WITHFP-NEXT:    addi a5, t6, -2048
-; ILP32E-WITHFP-NEXT:    sw t1, 32(sp)
-; ILP32E-WITHFP-NEXT:    sw t0, 36(sp)
-; ILP32E-WITHFP-NEXT:    sw a7, 40(sp)
-; ILP32E-WITHFP-NEXT:    sw a6, 44(sp)
 ; ILP32E-WITHFP-NEXT:    call va5_aligned_stack_callee
 ; ILP32E-WITHFP-NEXT:    addi sp, s0, -64
 ; ILP32E-WITHFP-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
@@ -2505,27 +2505,27 @@ define void @va5_aligned_stack_caller() nounwind {
 ; LP64-LP64F-LP64D-FPELIM:       # %bb.0:
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    addi sp, sp, -48
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
-; LP64-LP64F-LP64D-FPELIM-NEXT:    li t0, 17
-; LP64-LP64F-LP64D-FPELIM-NEXT:    li t1, 16
-; LP64-LP64F-LP64D-FPELIM-NEXT:    li t2, 15
+; LP64-LP64F-LP64D-FPELIM-NEXT:    li a7, 17
+; LP64-LP64F-LP64D-FPELIM-NEXT:    li t0, 16
+; LP64-LP64F-LP64D-FPELIM-NEXT:    li t1, 15
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a2, %hi(.LCPI11_0)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a3, %hi(.LCPI11_1)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a6, %hi(.LCPI11_2)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    lui t3, 2384
+; LP64-LP64F-LP64D-FPELIM-NEXT:    lui t2, 2384
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    li a0, 1
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    li a1, 11
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    li a4, 12
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    li a5, 13
-; LP64-LP64F-LP64D-FPELIM-NEXT:    li a7, 14
-; LP64-LP64F-LP64D-FPELIM-NEXT:    ld t4, %lo(.LCPI11_0)(a2)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    ld t3, %lo(.LCPI11_0)(a2)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    ld a2, %lo(.LCPI11_1)(a3)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    ld a3, %lo(.LCPI11_2)(a6)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    addiw a6, t3, 761
+; LP64-LP64F-LP64D-FPELIM-NEXT:    addiw a6, t2, 761
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    slli a6, a6, 11
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd t4, 0(sp)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd t2, 8(sp)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd t1, 16(sp)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd t0, 24(sp)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd t3, 0(sp)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd t1, 8(sp)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd t0, 16(sp)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a7, 24(sp)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    li a7, 14
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    call va5_aligned_stack_callee
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    addi sp, sp, 48
@@ -2537,27 +2537,27 @@ define void @va5_aligned_stack_caller() nounwind {
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd ra, 40(sp) # 8-byte Folded Spill
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    addi s0, sp, 48
-; LP64-LP64F-LP64D-WITHFP-NEXT:    li t0, 17
-; LP64-LP64F-LP64D-WITHFP-NEXT:    li t1, 16
-; LP64-LP64F-LP64D-WITHFP-NEXT:    li t2, 15
+; LP64-LP64F-LP64D-WITHFP-NEXT:    li a7, 17
+; LP64-LP64F-LP64D-WITHFP-NEXT:    li t0, 16
+; LP64-LP64F-LP64D-WITHFP-NEXT:    li t1, 15
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    lui a2, %hi(.LCPI11_0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    lui a3, %hi(.LCPI11_1)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    lui a6, %hi(.LCPI11_2)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    lui t3, 2384
+; LP64-LP64F-LP64D-WITHFP-NEXT:    lui t2, 2384
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    li a0, 1
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    li a1, 11
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    li a4, 12
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    li a5, 13
-; LP64-LP64F-LP64D-WITHFP-NEXT:    li a7, 14
-; LP64-LP64F-LP64D-WITHFP-NEXT:    ld t4, %lo(.LCPI11_0)(a2)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    ld t3, %lo(.LCPI11_0)(a2)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld a2, %lo(.LCPI11_1)(a3)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld a3, %lo(.LCPI11_2)(a6)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    addiw a6, t3, 761
+; LP64-LP64F-LP64D-WITHFP-NEXT:    addiw a6, t2, 761
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    slli a6, a6, 11
-; LP64-LP64F-LP64D-WITHFP-NEXT:    sd t4, 0(sp)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    sd t2, 8(sp)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    sd t1, 16(sp)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    sd t0, 24(sp)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    sd t3, 0(sp)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    sd t1, 8(sp)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    sd t0, 16(sp)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a7, 24(sp)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    li a7, 14
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    call va5_aligned_stack_callee
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
@@ -2570,27 +2570,27 @@ define void @va5_aligned_stack_caller() nounwind {
 ; LP64E-FPELIM-NEXT:    sd ra, 48(sp) # 8-byte Folded Spill
 ; LP64E-FPELIM-NEXT:    li a2, 17
 ; LP64E-FPELIM-NEXT:    li a3, 16
-; LP64E-FPELIM-NEXT:    li a6, 15
-; LP64E-FPELIM-NEXT:    lui a7, %hi(.LCPI11_0)
-; LP64E-FPELIM-NEXT:    li t0, 14
-; LP64E-FPELIM-NEXT:    lui t1, 2384
-; LP64E-FPELIM-NEXT:    lui t2, %hi(.LCPI11_1)
-; LP64E-FPELIM-NEXT:    lui t3, %hi(.LCPI11_2)
+; LP64E-FPELIM-NEXT:    li a5, 15
+; LP64E-FPELIM-NEXT:    lui a6, %hi(.LCPI11_0)
+; LP64E-FPELIM-NEXT:    li a7, 14
+; LP64E-FPELIM-NEXT:    lui t0, 2384
+; LP64E-FPELIM-NEXT:    lui t1, %hi(.LCPI11_1)
+; LP64E-FPELIM-NEXT:    lui t2, %hi(.LCPI11_2)
 ; LP64E-FPELIM-NEXT:    li a0, 1
 ; LP64E-FPELIM-NEXT:    li a1, 11
-; LP64E-FPELIM-NEXT:    li a4, 12
 ; LP64E-FPELIM-NEXT:    sd a3, 32(sp)
 ; LP64E-FPELIM-NEXT:    sd a2, 40(sp)
+; LP64E-FPELIM-NEXT:    li a4, 12
+; LP64E-FPELIM-NEXT:    ld a6, %lo(.LCPI11_0)(a6)
+; LP64E-FPELIM-NEXT:    addiw t0, t0, 761
+; LP64E-FPELIM-NEXT:    ld a2, %lo(.LCPI11_1)(t1)
+; LP64E-FPELIM-NEXT:    ld a3, %lo(.LCPI11_2)(t2)
+; LP64E-FPELIM-NEXT:    slli t0, t0, 11
+; LP64E-FPELIM-NEXT:    sd t0, 0(sp)
+; LP64E-FPELIM-NEXT:    sd a7, 8(sp)
+; LP64E-FPELIM-NEXT:    sd a6, 16(sp)
+; LP64E-FPELIM-NEXT:    sd a5, 24(sp)
 ; LP64E-FPELIM-NEXT:    li a5, 13
-; LP64E-FPELIM-NEXT:    ld a7, %lo(.LCPI11_0)(a7)
-; LP64E-FPELIM-NEXT:    addiw t1, t1, 761
-; LP64E-FPELIM-NEXT:    ld a2, %lo(.LCPI11_1)(t2)
-; LP64E-FPELIM-NEXT:    ld a3, %lo(.LCPI11_2)(t3)
-; LP64E-FPELIM-NEXT:    slli t1, t1, 11
-; LP64E-FPELIM-NEXT:    sd t1, 0(sp)
-; LP64E-FPELIM-NEXT:    sd t0, 8(sp)
-; LP64E-FPELIM-NEXT:    sd a7, 16(sp)
-; LP64E-FPELIM-NEXT:    sd a6, 24(sp)
 ; LP64E-FPELIM-NEXT:    call va5_aligned_stack_callee
 ; LP64E-FPELIM-NEXT:    ld ra, 48(sp) # 8-byte Folded Reload
 ; LP64E-FPELIM-NEXT:    addi sp, sp, 56
@@ -2604,27 +2604,27 @@ define void @va5_aligned_stack_caller() nounwind {
 ; LP64E-WITHFP-NEXT:    addi s0, sp, 64
 ; LP64E-WITHFP-NEXT:    li a2, 17
 ; LP64E-WITHFP-NEXT:    li a3, 16
-; LP64E-WITHFP-NEXT:    li a6, 15
-; LP64E-WITHFP-NEXT:    lui a7, %hi(.LCPI11_0)
-; LP64E-WITHFP-NEXT:    li t0, 14
-; LP64E-WITHFP-NEXT:    lui t1, 2384
-; LP64E-WITHFP-NEXT:    lui t2, %hi(.LCPI11_1)
-; LP64E-WITHFP-NEXT:    lui t3, %hi(.LCPI11_2)
+; LP64E-WITHFP-NEXT:    li a5, 15
+; LP64E-WITHFP-NEXT:    lui a6, %hi(.LCPI11_0)
+; LP64E-WITHFP-NEXT:    li a7, 14
+; LP64E-WITHFP-NEXT:    lui t0, 2384
+; LP64E-WITHFP-NEXT:    lui t1, %hi(.LCPI11_1)
+; LP64E-WITHFP-NEXT:    lui t2, %hi(.LCPI11_2)
 ; LP64E-WITHFP-NEXT:    li a0, 1
 ; LP64E-WITHFP-NEXT:    li a1, 11
-; LP64E-WITHFP-NEXT:    li a4, 12
 ; LP64E-WITHFP-NEXT:    sd a3, 32(sp)
 ; LP64E-WITHFP-NEXT:    sd a2, 40(sp)
+; LP64E-WITHFP-NEXT:    li a4, 12
+; LP64E-WITHFP-NEXT:    ld a6, %lo(.LCPI11_0)(a6)
+; LP64E-WITHFP-NEXT:    addiw t0, t0, 761
+; LP64E-WITHFP-NEXT:    ld a2, %lo(.LCPI11_1)(t1)
+; LP64E-WITHFP-NEXT:    ld a3, %lo(.LCPI11_2)(t2)
+; LP64E-WITHFP-NEXT:    slli t0, t0, 11
+; LP64E-WITHFP-NEXT:    sd t0, 0(sp)
+; LP64E-WITHFP-NEXT:    sd a7, 8(sp)
+; LP64E-WITHFP-NEXT:    sd a6, 16(sp)
+; LP64E-WITHFP-NEXT:    sd a5, 24(sp)
 ; LP64E-WITHFP-NEXT:    li a5, 13
-; LP64E-WITHFP-NEXT:    ld a7, %lo(.LCPI11_0)(a7)
-; LP64E-WITHFP-NEXT:    addiw t1, t1, 761
-; LP64E-WITHFP-NEXT:    ld a2, %lo(.LCPI11_1)(t2)
-; LP64E-WITHFP-NEXT:    ld a3, %lo(.LCPI11_2)(t3)
-; LP64E-WITHFP-NEXT:    slli t1, t1, 11
-; LP64E-WITHFP-NEXT:    sd t1, 0(sp)
-; LP64E-WITHFP-NEXT:    sd t0, 8(sp)
-; LP64E-WITHFP-NEXT:    sd a7, 16(sp)
-; LP64E-WITHFP-NEXT:    sd a6, 24(sp)
 ; LP64E-WITHFP-NEXT:    call va5_aligned_stack_callee
 ; LP64E-WITHFP-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; LP64E-WITHFP-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
@@ -2994,8 +2994,26 @@ define i32 @va_large_stack(ptr %fmt, ...) {
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    .cfi_def_cfa_offset 100000080
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a0, 24414
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    add a0, sp, a0
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a5, 312(a0)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a0, 24414
+; LP64-LP64F-LP64D-FPELIM-NEXT:    add a0, sp, a0
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a6, 320(a0)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a0, 24414
+; LP64-LP64F-LP64D-FPELIM-NEXT:    add a0, sp, a0
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a7, 328(a0)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a0, 24414
+; LP64-LP64F-LP64D-FPELIM-NEXT:    add a0, sp, a0
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a1, 280(a0)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a0, 24414
+; LP64-LP64F-LP64D-FPELIM-NEXT:    add a0, sp, a0
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a2, 288(a0)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a0, 24414
+; LP64-LP64F-LP64D-FPELIM-NEXT:    add a0, sp, a0
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a3, 296(a0)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a0, 24414
+; LP64-LP64F-LP64D-FPELIM-NEXT:    add a0, sp, a0
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a4, 304(a0)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a0, 24414
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    addiw a0, a0, 284
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    add a0, sp, a0
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a0, 8(sp)
@@ -3003,24 +3021,6 @@ define i32 @va_large_stack(ptr %fmt, ...) {
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    add a0, sp, a0
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    lw a0, 280(a0)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a1, 24414
-; LP64-LP64F-LP64D-FPELIM-NEXT:    add a1, sp, a1
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a5, 312(a1)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a1, 24414
-; LP64-LP64F-LP64D-FPELIM-NEXT:    add a1, sp, a1
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a6, 320(a1)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a1, 24414
-; LP64-LP64F-LP64D-FPELIM-NEXT:    add a1, sp, a1
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a7, 328(a1)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a1, 24414
-; LP64-LP64F-LP64D-FPELIM-NEXT:    add a1, sp, a1
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a2, 288(a1)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a1, 24414
-; LP64-LP64F-LP64D-FPELIM-NEXT:    add a1, sp, a1
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a3, 296(a1)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a1, 24414
-; LP64-LP64F-LP64D-FPELIM-NEXT:    add a1, sp, a1
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a4, 304(a1)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a1, 24414
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    addiw a1, a1, 336
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    add sp, sp, a1
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    .cfi_def_cfa_offset 0
@@ -3039,18 +3039,18 @@ define i32 @va_large_stack(ptr %fmt, ...) {
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    lui a0, 24414
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    addiw a0, a0, -1680
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sub sp, sp, a0
-; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a1, 8(s0)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a0, s0, 12
-; LP64-LP64F-LP64D-WITHFP-NEXT:    lui a1, 24414
-; LP64-LP64F-LP64D-WITHFP-NEXT:    sub a1, s0, a1
-; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a0, -288(a1)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    lw a0, 8(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a5, 40(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a6, 48(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a7, 56(s0)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a1, 8(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a2, 16(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a3, 24(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a4, 32(s0)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a0, s0, 12
+; LP64-LP64F-LP64D-WITHFP-NEXT:    lui a1, 24414
+; LP64-LP64F-LP64D-WITHFP-NEXT:    sub a1, s0, a1
+; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a0, -288(a1)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    lw a0, 8(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    lui a1, 24414
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    addiw a1, a1, -1680
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    add sp, sp, a1
@@ -3070,28 +3070,28 @@ define i32 @va_large_stack(ptr %fmt, ...) {
 ; LP64E-FPELIM-NEXT:    sub sp, sp, a0
 ; LP64E-FPELIM-NEXT:    .cfi_def_cfa_offset 100000064
 ; LP64E-FPELIM-NEXT:    lui a0, 24414
-; LP64E-FPELIM-NEXT:    addiw a0, a0, 284
 ; LP64E-FPELIM-NEXT:    add a0, sp, a0
-; LP64E-FPELIM-NEXT:    sd a0, 8(sp)
+; LP64E-FPELIM-NEXT:    sd a5, 312(a0)
 ; LP64E-FPELIM-NEXT:    lui a0, 24414
 ; LP64E-FPELIM-NEXT:    add a0, sp, a0
 ; LP64E-FPELIM-NEXT:    sd a1, 280(a0)
 ; LP64E-FPELIM-NEXT:    lui a0, 24414
 ; LP64E-FPELIM-NEXT:    add a0, sp, a0
+; LP64E-FPELIM-NEXT:    sd a2, 288(a0)
+; LP64E-FPELIM-NEXT:    lui a0, 24414
+; LP64E-FPELIM-NEXT:    add a0, sp, a0
+; LP64E-FPELIM-NEXT:    sd a3, 296(a0)
+; LP64E-FPELIM-NEXT:    lui a0, 24414
+; LP64E-FPELIM-NEXT:    add a0, sp, a0
+; LP64E-FPELIM-NEXT:    sd a4, 304(a0)
+; LP64E-FPELIM-NEXT:    lui a0, 24414
+; LP64E-FPELIM-NEXT:    addiw a0, a0, 284
+; LP64E-FPELIM-NEXT:    add a0, sp, a0
+; LP64E-FPELIM-NEXT:    sd a0, 8(sp)
+; LP64E-FPELIM-NEXT:    lui a0, 24414
+; LP64E-FPELIM-NEXT:    add a0, sp, a0
 ; LP64E-FPELIM-NEXT:    lw a0, 280(a0)
 ; LP64E-FPELIM-NEXT:    lui a1, 24414
-; LP64E-FPELIM-NEXT:    add a1, sp, a1
-; LP64E-FPELIM-NEXT:    sd a5, 312(a1)
-; LP64E-FPELIM-NEXT:    lui a1, 24414
-; LP64E-FPELIM-NEXT:    add a1, sp, a1
-; LP64E-FPELIM-NEXT:    sd a2, 288(a1)
-; LP64E-FPELIM-NEXT:    lui a1, 24414
-; LP64E-FPELIM-NEXT:    add a1, sp, a1
-; LP64E-FPELIM-NEXT:    sd a3, 296(a1)
-; LP64E-FPELIM-NEXT:    lui a1, 24414
-; LP64E-FPELIM-NEXT:    add a1, sp, a1
-; LP64E-FPELIM-NEXT:    sd a4, 304(a1)
-; LP64E-FPELIM-NEXT:    lui a1, 24414
 ; LP64E-FPELIM-NEXT:    addiw a1, a1, 320
 ; LP64E-FPELIM-NEXT:    add sp, sp, a1
 ; LP64E-FPELIM-NEXT:    .cfi_def_cfa_offset 0
@@ -3110,16 +3110,16 @@ define i32 @va_large_stack(ptr %fmt, ...) {
 ; LP64E-WITHFP-NEXT:    lui a0, 24414
 ; LP64E-WITHFP-NEXT:    addiw a0, a0, -1704
 ; LP64E-WITHFP-NEXT:    sub sp, sp, a0
-; LP64E-WITHFP-NEXT:    addi a0, s0, 12
-; LP64E-WITHFP-NEXT:    lui a6, 24414
-; LP64E-WITHFP-NEXT:    sub a6, s0, a6
-; LP64E-WITHFP-NEXT:    sd a0, -288(a6)
-; LP64E-WITHFP-NEXT:    sd a1, 8(s0)
-; LP64E-WITHFP-NEXT:    lw a0, 8(s0)
 ; LP64E-WITHFP-NEXT:    sd a5, 40(s0)
+; LP64E-WITHFP-NEXT:    sd a1, 8(s0)
 ; LP64E-WITHFP-NEXT:    sd a2, 16(s0)
 ; LP64E-WITHFP-NEXT:    sd a3, 24(s0)
 ; LP64E-WITHFP-NEXT:    sd a4, 32(s0)
+; LP64E-WITHFP-NEXT:    addi a0, s0, 12
+; LP64E-WITHFP-NEXT:    lui a1, 24414
+; LP64E-WITHFP-NEXT:    sub a1, s0, a1
+; LP64E-WITHFP-NEXT:    sd a0, -288(a1)
+; LP64E-WITHFP-NEXT:    lw a0, 8(s0)
 ; LP64E-WITHFP-NEXT:    lui a1, 24414
 ; LP64E-WITHFP-NEXT:    addiw a1, a1, -1704
 ; LP64E-WITHFP-NEXT:    add sp, sp, a1
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index 437b7e557718c..13beb844dec36 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -9,9 +9,9 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lb a0, 3(a0)
-; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    slli a1, a1, 3
@@ -29,26 +29,26 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_4bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 2(a0)
-; RV32I-NEXT:    lbu a5, 3(a0)
-; RV32I-NEXT:    lbu a0, 0(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu a6, 1(a1)
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 16
@@ -73,9 +73,9 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lb a0, 3(a0)
-; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    slli a1, a1, 3
@@ -93,26 +93,26 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_4bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 2(a0)
-; RV32I-NEXT:    lbu a5, 3(a0)
-; RV32I-NEXT:    lbu a0, 0(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu a6, 1(a1)
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 16
@@ -137,9 +137,9 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lb a0, 3(a0)
-; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    slli a1, a1, 3
@@ -157,26 +157,26 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_4bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 2(a0)
-; RV32I-NEXT:    lbu a5, 3(a0)
-; RV32I-NEXT:    lbu a0, 0(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu a6, 1(a1)
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    sra a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 16
@@ -224,20 +224,20 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    or a7, t2, t0
+; RV64I-NEXT:    lbu t0, 0(a1)
 ; RV64I-NEXT:    lbu t1, 1(a1)
-; RV64I-NEXT:    or t0, t2, t0
 ; RV64I-NEXT:    lbu t2, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
 ; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or a7, t1, a7
+; RV64I-NEXT:    or t0, t1, t0
 ; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    or a4, t0, a6
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a4, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a4, a4, 35
@@ -263,40 +263,40 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_8bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a5, 7(a0)
-; RV32I-NEXT:    lbu a6, 4(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a3, a6
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 1(a1)
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a3, 4(a0)
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 6(a0)
+; RV32I-NEXT:    lbu a6, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    or a5, a4, a3
-; RV32I-NEXT:    or a4, a1, a6
-; RV32I-NEXT:    slli a4, a4, 3
-; RV32I-NEXT:    addi a3, a4, -32
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a5, a5, a3
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    slli a4, a1, 3
 ; RV32I-NEXT:    srl a1, a5, a4
+; RV32I-NEXT:    addi a3, a4, -32
 ; RV32I-NEXT:    bltz a3, .LBB3_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    j .LBB3_3
 ; RV32I-NEXT:  .LBB3_2:
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
+; RV32I-NEXT:    lbu a6, 0(a0)
+; RV32I-NEXT:    lbu a7, 1(a0)
 ; RV32I-NEXT:    lbu t0, 2(a0)
 ; RV32I-NEXT:    lbu a0, 3(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a6, a7, a6
 ; RV32I-NEXT:    slli a5, a5, 1
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a0, a0, 24
@@ -360,20 +360,20 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    or a7, t2, t0
+; RV64I-NEXT:    lbu t0, 0(a1)
 ; RV64I-NEXT:    lbu t1, 1(a1)
-; RV64I-NEXT:    or t0, t2, t0
 ; RV64I-NEXT:    lbu t2, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
 ; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or a7, t1, a7
+; RV64I-NEXT:    or t0, t1, t0
 ; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    or a4, t0, a6
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a4, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a4, a4, 35
@@ -399,40 +399,40 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_8bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 2(a0)
-; RV32I-NEXT:    lbu a5, 3(a0)
-; RV32I-NEXT:    lbu a6, 0(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a3, a6
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 1(a1)
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    or a5, a4, a3
-; RV32I-NEXT:    or a4, a1, a6
-; RV32I-NEXT:    slli a4, a4, 3
-; RV32I-NEXT:    addi a3, a4, -32
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a5, a5, a3
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    slli a4, a1, 3
 ; RV32I-NEXT:    sll a1, a5, a4
+; RV32I-NEXT:    addi a3, a4, -32
 ; RV32I-NEXT:    bltz a3, .LBB4_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    j .LBB4_3
 ; RV32I-NEXT:  .LBB4_2:
-; RV32I-NEXT:    lbu a6, 5(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
 ; RV32I-NEXT:    lbu t0, 6(a0)
 ; RV32I-NEXT:    lbu a0, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a6, a7, a6
 ; RV32I-NEXT:    srli a5, a5, 1
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a0, a0, 24
@@ -496,20 +496,20 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    or a7, t2, t0
+; RV64I-NEXT:    lbu t0, 0(a1)
 ; RV64I-NEXT:    lbu t1, 1(a1)
-; RV64I-NEXT:    or t0, t2, t0
 ; RV64I-NEXT:    lbu t2, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
 ; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or a7, t1, a7
+; RV64I-NEXT:    or t0, t1, t0
 ; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    or a4, t0, a6
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a4, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a4, a4, 35
@@ -535,42 +535,41 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_8bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a3, 4(a0)
+; RV32I-NEXT:    lbu a4, 5(a0)
 ; RV32I-NEXT:    lbu a5, 6(a0)
 ; RV32I-NEXT:    lbu a6, 7(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    lbu a7, 0(a1)
-; RV32I-NEXT:    lbu t0, 1(a1)
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    lbu a4, 2(a1)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t0, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, a4
+; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    or a1, a1, t0
 ; RV32I-NEXT:    slli a4, a5, 16
 ; RV32I-NEXT:    slli a5, a6, 24
 ; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    or a4, a4, a3
 ; RV32I-NEXT:    or a3, a1, a7
 ; RV32I-NEXT:    slli a3, a3, 3
-; RV32I-NEXT:    addi a6, a3, -32
 ; RV32I-NEXT:    sra a1, a4, a3
+; RV32I-NEXT:    addi a6, a3, -32
 ; RV32I-NEXT:    bltz a6, .LBB5_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    srai a5, a5, 31
 ; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    srai a1, a5, 31
 ; RV32I-NEXT:    j .LBB5_3
 ; RV32I-NEXT:  .LBB5_2:
-; RV32I-NEXT:    lbu a5, 1(a0)
-; RV32I-NEXT:    lbu a6, 0(a0)
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a6, 1(a0)
 ; RV32I-NEXT:    lbu a7, 2(a0)
 ; RV32I-NEXT:    lbu a0, 3(a0)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
 ; RV32I-NEXT:    slli a4, a4, 1
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli a0, a0, 24
@@ -633,54 +632,54 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli t3, t3, 24
 ; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    or a7, t3, t0
+; RV64I-NEXT:    lbu t0, 0(a1)
 ; RV64I-NEXT:    lbu t2, 1(a1)
-; RV64I-NEXT:    or t0, t3, t0
 ; RV64I-NEXT:    lbu t3, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
 ; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    or t0, t2, t0
 ; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, t1, a5
-; RV64I-NEXT:    or a5, t0, a6
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a6, a5, 35
 ; RV64I-NEXT:    or a5, a4, a3
 ; RV64I-NEXT:    or a4, a6, a1
-; RV64I-NEXT:    addi a3, a4, -64
 ; RV64I-NEXT:    srl a1, a5, a4
+; RV64I-NEXT:    addi a3, a4, -64
 ; RV64I-NEXT:    bltz a3, .LBB6_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB6_3
 ; RV64I-NEXT:  .LBB6_2:
-; RV64I-NEXT:    lbu a6, 1(a0)
-; RV64I-NEXT:    lbu a7, 2(a0)
-; RV64I-NEXT:    lbu t0, 3(a0)
-; RV64I-NEXT:    lbu t1, 0(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, a6, t1
-; RV64I-NEXT:    lbu t1, 4(a0)
-; RV64I-NEXT:    lbu t2, 5(a0)
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
+; RV64I-NEXT:    lbu t0, 2(a0)
+; RV64I-NEXT:    lbu t1, 3(a0)
+; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    lbu t0, 4(a0)
+; RV64I-NEXT:    lbu t1, 5(a0)
+; RV64I-NEXT:    lbu t2, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, t2
 ; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    not a7, a4
 ; RV64I-NEXT:    slli a5, a5, 1
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t0
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    srl a0, a0, a4
@@ -787,10 +786,10 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    andi a1, a1, 12
 ; RV32I-NEXT:    add a1, t2, a1
 ; RV32I-NEXT:    andi a3, a0, 24
+; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    lw a4, 0(a1)
 ; RV32I-NEXT:    lw a5, 4(a1)
 ; RV32I-NEXT:    lw a6, 8(a1)
-; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    lw a1, 12(a1)
 ; RV32I-NEXT:    srl a7, a5, a0
 ; RV32I-NEXT:    slli t0, a6, 1
@@ -872,54 +871,54 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    slli t3, t3, 24
 ; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    or a7, t3, t0
+; RV64I-NEXT:    lbu t0, 0(a1)
 ; RV64I-NEXT:    lbu t2, 1(a1)
-; RV64I-NEXT:    or t0, t3, t0
 ; RV64I-NEXT:    lbu t3, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
 ; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    or t0, t2, t0
 ; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, t1, a5
-; RV64I-NEXT:    or a5, t0, a6
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a1, a1, 5
 ; RV64I-NEXT:    slli a6, a5, 37
 ; RV64I-NEXT:    or a5, a4, a3
 ; RV64I-NEXT:    or a4, a6, a1
-; RV64I-NEXT:    addi a3, a4, -64
 ; RV64I-NEXT:    srl a1, a5, a4
+; RV64I-NEXT:    addi a3, a4, -64
 ; RV64I-NEXT:    bltz a3, .LBB7_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB7_3
 ; RV64I-NEXT:  .LBB7_2:
-; RV64I-NEXT:    lbu a6, 1(a0)
-; RV64I-NEXT:    lbu a7, 2(a0)
-; RV64I-NEXT:    lbu t0, 3(a0)
-; RV64I-NEXT:    lbu t1, 0(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, a6, t1
-; RV64I-NEXT:    lbu t1, 4(a0)
-; RV64I-NEXT:    lbu t2, 5(a0)
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
+; RV64I-NEXT:    lbu t0, 2(a0)
+; RV64I-NEXT:    lbu t1, 3(a0)
+; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    lbu t0, 4(a0)
+; RV64I-NEXT:    lbu t1, 5(a0)
+; RV64I-NEXT:    lbu t2, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, t2
 ; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    not a7, a4
 ; RV64I-NEXT:    slli a5, a5, 1
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t0
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    srl a0, a0, a4
@@ -1016,38 +1015,38 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    sw a4, 4(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
 ; RV32I-NEXT:    sw a0, 12(sp)
-; RV32I-NEXT:    lw a0, 8(a1)
+; RV32I-NEXT:    lw a0, 0(a1)
 ; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    srli a5, a0, 16
-; RV32I-NEXT:    srli a6, a0, 24
-; RV32I-NEXT:    srli a7, a0, 8
+; RV32I-NEXT:    srli a5, a4, 16
+; RV32I-NEXT:    srli a6, a4, 24
+; RV32I-NEXT:    srli a7, a4, 8
 ; RV32I-NEXT:    srli t0, a1, 16
 ; RV32I-NEXT:    srli t1, a1, 24
 ; RV32I-NEXT:    srli t2, a1, 8
-; RV32I-NEXT:    srli t3, a4, 16
-; RV32I-NEXT:    srli t4, a4, 24
-; RV32I-NEXT:    srli t5, a4, 8
+; RV32I-NEXT:    srli t3, a0, 16
+; RV32I-NEXT:    srli t4, a0, 24
+; RV32I-NEXT:    srli t5, a0, 8
 ; RV32I-NEXT:    srli t6, a3, 16
-; RV32I-NEXT:    sb a0, 8(a2)
+; RV32I-NEXT:    sb a4, 8(a2)
 ; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb a5, 10(a2)
 ; RV32I-NEXT:    sb a6, 11(a2)
-; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    srli a4, a3, 24
 ; RV32I-NEXT:    sb a1, 12(a2)
 ; RV32I-NEXT:    sb t2, 13(a2)
 ; RV32I-NEXT:    sb t0, 14(a2)
 ; RV32I-NEXT:    sb t1, 15(a2)
 ; RV32I-NEXT:    srli a1, a3, 8
-; RV32I-NEXT:    sb a4, 0(a2)
+; RV32I-NEXT:    sb a0, 0(a2)
 ; RV32I-NEXT:    sb t5, 1(a2)
 ; RV32I-NEXT:    sb t3, 2(a2)
 ; RV32I-NEXT:    sb t4, 3(a2)
 ; RV32I-NEXT:    sb a3, 4(a2)
 ; RV32I-NEXT:    sb a1, 5(a2)
 ; RV32I-NEXT:    sb t6, 6(a2)
-; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    sb a4, 7(a2)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -1087,54 +1086,54 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli t3, t3, 24
 ; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    or a7, t3, t0
+; RV64I-NEXT:    lbu t0, 0(a1)
 ; RV64I-NEXT:    lbu t2, 1(a1)
-; RV64I-NEXT:    or t0, t3, t0
 ; RV64I-NEXT:    lbu t3, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
 ; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    or t0, t2, t0
 ; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, t1, a5
-; RV64I-NEXT:    or a5, t0, a6
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a6, a5, 35
 ; RV64I-NEXT:    or a5, a4, a3
 ; RV64I-NEXT:    or a4, a6, a1
-; RV64I-NEXT:    addi a3, a4, -64
 ; RV64I-NEXT:    sll a1, a5, a4
+; RV64I-NEXT:    addi a3, a4, -64
 ; RV64I-NEXT:    bltz a3, .LBB8_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB8_3
 ; RV64I-NEXT:  .LBB8_2:
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    lbu a7, 10(a0)
-; RV64I-NEXT:    lbu t0, 11(a0)
-; RV64I-NEXT:    lbu t1, 8(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, a6, t1
-; RV64I-NEXT:    lbu t1, 12(a0)
-; RV64I-NEXT:    lbu t2, 13(a0)
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 14(a0)
-; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a7, 9(a0)
+; RV64I-NEXT:    lbu t0, 10(a0)
+; RV64I-NEXT:    lbu t1, 11(a0)
+; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    lbu t0, 12(a0)
+; RV64I-NEXT:    lbu t1, 13(a0)
+; RV64I-NEXT:    lbu t2, 14(a0)
+; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, t2
 ; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    not a7, a4
 ; RV64I-NEXT:    srli a5, a5, 1
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t0
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    sll a0, a0, a4
@@ -1241,11 +1240,11 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    andi a1, a1, 12
 ; RV32I-NEXT:    sub a1, t2, a1
 ; RV32I-NEXT:    andi a3, a0, 24
+; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    lw a4, 0(a1)
 ; RV32I-NEXT:    lw a5, 4(a1)
 ; RV32I-NEXT:    lw a6, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    sll a7, a5, a0
 ; RV32I-NEXT:    srli t0, a4, 1
 ; RV32I-NEXT:    sll a1, a1, a0
@@ -1326,54 +1325,54 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV64I-NEXT:    slli t3, t3, 24
 ; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    or a7, t3, t0
+; RV64I-NEXT:    lbu t0, 0(a1)
 ; RV64I-NEXT:    lbu t2, 1(a1)
-; RV64I-NEXT:    or t0, t3, t0
 ; RV64I-NEXT:    lbu t3, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
 ; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    or t0, t2, t0
 ; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, t1, a5
-; RV64I-NEXT:    or a5, t0, a6
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a1, a1, 5
 ; RV64I-NEXT:    slli a6, a5, 37
 ; RV64I-NEXT:    or a5, a4, a3
 ; RV64I-NEXT:    or a4, a6, a1
-; RV64I-NEXT:    addi a3, a4, -64
 ; RV64I-NEXT:    sll a1, a5, a4
+; RV64I-NEXT:    addi a3, a4, -64
 ; RV64I-NEXT:    bltz a3, .LBB9_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB9_3
 ; RV64I-NEXT:  .LBB9_2:
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    lbu a7, 10(a0)
-; RV64I-NEXT:    lbu t0, 11(a0)
-; RV64I-NEXT:    lbu t1, 8(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, a6, t1
-; RV64I-NEXT:    lbu t1, 12(a0)
-; RV64I-NEXT:    lbu t2, 13(a0)
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 14(a0)
-; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a7, 9(a0)
+; RV64I-NEXT:    lbu t0, 10(a0)
+; RV64I-NEXT:    lbu t1, 11(a0)
+; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    lbu t0, 12(a0)
+; RV64I-NEXT:    lbu t1, 13(a0)
+; RV64I-NEXT:    lbu t2, 14(a0)
+; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, t2
 ; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    not a7, a4
 ; RV64I-NEXT:    srli a5, a5, 1
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t0
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    sll a0, a0, a4
@@ -1470,38 +1469,38 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    sw a4, 20(sp)
 ; RV32I-NEXT:    sw a5, 24(sp)
 ; RV32I-NEXT:    sw a0, 28(sp)
-; RV32I-NEXT:    lw a0, 8(a1)
+; RV32I-NEXT:    lw a0, 0(a1)
 ; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    srli a5, a0, 16
-; RV32I-NEXT:    srli a6, a0, 24
-; RV32I-NEXT:    srli a7, a0, 8
+; RV32I-NEXT:    srli a5, a4, 16
+; RV32I-NEXT:    srli a6, a4, 24
+; RV32I-NEXT:    srli a7, a4, 8
 ; RV32I-NEXT:    srli t0, a1, 16
 ; RV32I-NEXT:    srli t1, a1, 24
 ; RV32I-NEXT:    srli t2, a1, 8
-; RV32I-NEXT:    srli t3, a4, 16
-; RV32I-NEXT:    srli t4, a4, 24
-; RV32I-NEXT:    srli t5, a4, 8
+; RV32I-NEXT:    srli t3, a0, 16
+; RV32I-NEXT:    srli t4, a0, 24
+; RV32I-NEXT:    srli t5, a0, 8
 ; RV32I-NEXT:    srli t6, a3, 16
-; RV32I-NEXT:    sb a0, 8(a2)
+; RV32I-NEXT:    sb a4, 8(a2)
 ; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb a5, 10(a2)
 ; RV32I-NEXT:    sb a6, 11(a2)
-; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    srli a4, a3, 24
 ; RV32I-NEXT:    sb a1, 12(a2)
 ; RV32I-NEXT:    sb t2, 13(a2)
 ; RV32I-NEXT:    sb t0, 14(a2)
 ; RV32I-NEXT:    sb t1, 15(a2)
 ; RV32I-NEXT:    srli a1, a3, 8
-; RV32I-NEXT:    sb a4, 0(a2)
+; RV32I-NEXT:    sb a0, 0(a2)
 ; RV32I-NEXT:    sb t5, 1(a2)
 ; RV32I-NEXT:    sb t3, 2(a2)
 ; RV32I-NEXT:    sb t4, 3(a2)
 ; RV32I-NEXT:    sb a3, 4(a2)
 ; RV32I-NEXT:    sb a1, 5(a2)
 ; RV32I-NEXT:    sb t6, 6(a2)
-; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    sb a4, 7(a2)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -1542,56 +1541,55 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli t3, t3, 24
 ; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    or a7, t3, t0
+; RV64I-NEXT:    lbu t0, 0(a1)
 ; RV64I-NEXT:    lbu t2, 1(a1)
-; RV64I-NEXT:    or t0, t3, t0
 ; RV64I-NEXT:    lbu t3, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
 ; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    or t0, t2, t0
 ; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a5, t1, a5
-; RV64I-NEXT:    or a4, t0, a6
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a4, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a6, a5, 32
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a7, a4, 35
 ; RV64I-NEXT:    or a4, a6, a3
 ; RV64I-NEXT:    or a3, a7, a1
-; RV64I-NEXT:    addi a6, a3, -64
 ; RV64I-NEXT:    sra a1, a4, a3
+; RV64I-NEXT:    addi a6, a3, -64
 ; RV64I-NEXT:    bltz a6, .LBB10_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    sraiw a3, a5, 31
 ; RV64I-NEXT:    mv a0, a1
-; RV64I-NEXT:    mv a1, a3
+; RV64I-NEXT:    sraiw a1, a5, 31
 ; RV64I-NEXT:    j .LBB10_3
 ; RV64I-NEXT:  .LBB10_2:
-; RV64I-NEXT:    lbu a5, 1(a0)
-; RV64I-NEXT:    lbu a6, 2(a0)
-; RV64I-NEXT:    lbu a7, 3(a0)
-; RV64I-NEXT:    lbu t0, 0(a0)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a5, t0
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu t1, 5(a0)
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    lbu a5, 0(a0)
+; RV64I-NEXT:    lbu a6, 1(a0)
+; RV64I-NEXT:    lbu a7, 2(a0)
+; RV64I-NEXT:    lbu t0, 3(a0)
+; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    not a6, a3
 ; RV64I-NEXT:    slli a4, a4, 1
-; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    srl a0, a0, a3
@@ -1665,17 +1663,17 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli t1, t1, 8
 ; RV32I-NEXT:    or a4, t3, a4
 ; RV32I-NEXT:    or t3, t5, t4
-; RV32I-NEXT:    lbu t4, 0(a1)
-; RV32I-NEXT:    lbu t5, 1(a1)
 ; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    lbu t1, 2(a1)
+; RV32I-NEXT:    lbu t1, 0(a1)
+; RV32I-NEXT:    lbu t4, 1(a1)
+; RV32I-NEXT:    lbu t5, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or t1, t4, t1
+; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t1
-; RV32I-NEXT:    mv t1, sp
+; RV32I-NEXT:    or a1, a1, t5
+; RV32I-NEXT:    mv t4, sp
 ; RV32I-NEXT:    slli t2, t2, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or t2, a0, t2
@@ -1684,7 +1682,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a5, a7, a6
 ; RV32I-NEXT:    or a4, t3, a4
 ; RV32I-NEXT:    or a6, t2, t0
-; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    or a1, a1, t1
 ; RV32I-NEXT:    sw a0, 16(sp)
 ; RV32I-NEXT:    sw a0, 20(sp)
 ; RV32I-NEXT:    sw a0, 24(sp)
@@ -1695,12 +1693,12 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw a6, 12(sp)
 ; RV32I-NEXT:    slli a0, a1, 3
 ; RV32I-NEXT:    andi a1, a1, 12
-; RV32I-NEXT:    add a1, t1, a1
+; RV32I-NEXT:    add a1, t4, a1
 ; RV32I-NEXT:    andi a3, a0, 24
+; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    lw a4, 0(a1)
 ; RV32I-NEXT:    lw a5, 4(a1)
 ; RV32I-NEXT:    lw a6, 8(a1)
-; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    lw a1, 12(a1)
 ; RV32I-NEXT:    srl a7, a5, a0
 ; RV32I-NEXT:    slli t0, a6, 1
@@ -1782,56 +1780,55 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    slli t3, t3, 24
 ; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    or a7, t3, t0
+; RV64I-NEXT:    lbu t0, 0(a1)
 ; RV64I-NEXT:    lbu t2, 1(a1)
-; RV64I-NEXT:    or t0, t3, t0
 ; RV64I-NEXT:    lbu t3, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
 ; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    or t0, t2, t0
 ; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a5, t1, a5
-; RV64I-NEXT:    or a4, t0, a6
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a4, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a6, a5, 32
 ; RV64I-NEXT:    slli a1, a1, 5
 ; RV64I-NEXT:    slli a7, a4, 37
 ; RV64I-NEXT:    or a4, a6, a3
 ; RV64I-NEXT:    or a3, a7, a1
-; RV64I-NEXT:    addi a6, a3, -64
 ; RV64I-NEXT:    sra a1, a4, a3
+; RV64I-NEXT:    addi a6, a3, -64
 ; RV64I-NEXT:    bltz a6, .LBB11_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    sraiw a3, a5, 31
 ; RV64I-NEXT:    mv a0, a1
-; RV64I-NEXT:    mv a1, a3
+; RV64I-NEXT:    sraiw a1, a5, 31
 ; RV64I-NEXT:    j .LBB11_3
 ; RV64I-NEXT:  .LBB11_2:
-; RV64I-NEXT:    lbu a5, 1(a0)
-; RV64I-NEXT:    lbu a6, 2(a0)
-; RV64I-NEXT:    lbu a7, 3(a0)
-; RV64I-NEXT:    lbu t0, 0(a0)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a5, t0
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu t1, 5(a0)
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    lbu a5, 0(a0)
+; RV64I-NEXT:    lbu a6, 1(a0)
+; RV64I-NEXT:    lbu a7, 2(a0)
+; RV64I-NEXT:    lbu t0, 3(a0)
+; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    not a6, a3
 ; RV64I-NEXT:    slli a4, a4, 1
-; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    srl a0, a0, a3
@@ -1927,38 +1924,38 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    sw a4, 4(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
 ; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    lw a0, 8(a1)
+; RV32I-NEXT:    lw a0, 0(a1)
 ; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    srli a5, a0, 16
-; RV32I-NEXT:    srli a6, a0, 24
-; RV32I-NEXT:    srli a7, a0, 8
+; RV32I-NEXT:    srli a5, a4, 16
+; RV32I-NEXT:    srli a6, a4, 24
+; RV32I-NEXT:    srli a7, a4, 8
 ; RV32I-NEXT:    srli t0, a1, 16
 ; RV32I-NEXT:    srli t1, a1, 24
 ; RV32I-NEXT:    srli t2, a1, 8
-; RV32I-NEXT:    srli t3, a4, 16
-; RV32I-NEXT:    srli t4, a4, 24
-; RV32I-NEXT:    srli t5, a4, 8
+; RV32I-NEXT:    srli t3, a0, 16
+; RV32I-NEXT:    srli t4, a0, 24
+; RV32I-NEXT:    srli t5, a0, 8
 ; RV32I-NEXT:    srli t6, a3, 16
-; RV32I-NEXT:    sb a0, 8(a2)
+; RV32I-NEXT:    sb a4, 8(a2)
 ; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb a5, 10(a2)
 ; RV32I-NEXT:    sb a6, 11(a2)
-; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    srli a4, a3, 24
 ; RV32I-NEXT:    sb a1, 12(a2)
 ; RV32I-NEXT:    sb t2, 13(a2)
 ; RV32I-NEXT:    sb t0, 14(a2)
 ; RV32I-NEXT:    sb t1, 15(a2)
 ; RV32I-NEXT:    srli a1, a3, 8
-; RV32I-NEXT:    sb a4, 0(a2)
+; RV32I-NEXT:    sb a0, 0(a2)
 ; RV32I-NEXT:    sb t5, 1(a2)
 ; RV32I-NEXT:    sb t3, 2(a2)
 ; RV32I-NEXT:    sb t4, 3(a2)
 ; RV32I-NEXT:    sb a3, 4(a2)
 ; RV32I-NEXT:    sb a1, 5(a2)
 ; RV32I-NEXT:    sb t6, 6(a2)
-; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    sb a4, 7(a2)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -2065,13 +2062,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli s7, s7, 24
 ; RV64I-NEXT:    or s5, s6, s5
 ; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    or s3, s7, s4
+; RV64I-NEXT:    lbu s4, 4(a1)
 ; RV64I-NEXT:    lbu s6, 5(a1)
-; RV64I-NEXT:    or s4, s7, s4
 ; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli s6, s6, 8
-; RV64I-NEXT:    or s3, s6, s3
+; RV64I-NEXT:    or s4, s6, s4
 ; RV64I-NEXT:    sd zero, 32(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
 ; RV64I-NEXT:    sd zero, 48(sp)
@@ -2088,8 +2085,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a0, a0, t5
 ; RV64I-NEXT:    or t0, s0, t6
 ; RV64I-NEXT:    or t1, s5, s1
-; RV64I-NEXT:    or t2, s4, s2
-; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    or a1, a1, s4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a6, a6, 32
 ; RV64I-NEXT:    slli a0, a0, 32
@@ -2108,21 +2105,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    andi a1, a1, 24
 ; RV64I-NEXT:    add a1, s6, a1
 ; RV64I-NEXT:    andi a0, a4, 56
+; RV64I-NEXT:    xori a5, a0, 63
 ; RV64I-NEXT:    ld a3, 0(a1)
-; RV64I-NEXT:    ld a5, 8(a1)
-; RV64I-NEXT:    ld a6, 16(a1)
-; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    ld a6, 8(a1)
+; RV64I-NEXT:    ld a7, 16(a1)
 ; RV64I-NEXT:    ld t0, 24(a1)
-; RV64I-NEXT:    srl a0, a5, a4
-; RV64I-NEXT:    slli t1, a6, 1
+; RV64I-NEXT:    srl a0, a6, a4
+; RV64I-NEXT:    slli t1, a7, 1
 ; RV64I-NEXT:    srl a1, a3, a4
-; RV64I-NEXT:    slli a5, a5, 1
-; RV64I-NEXT:    srl a3, a6, a4
-; RV64I-NEXT:    slli a6, t0, 1
+; RV64I-NEXT:    slli a6, a6, 1
+; RV64I-NEXT:    srl a3, a7, a4
+; RV64I-NEXT:    slli a7, t0, 1
 ; RV64I-NEXT:    srl t0, t0, a4
-; RV64I-NEXT:    sll a4, t1, a7
-; RV64I-NEXT:    sll a5, a5, a7
-; RV64I-NEXT:    sll a6, a6, a7
+; RV64I-NEXT:    sll a4, t1, a5
+; RV64I-NEXT:    sll a6, a6, a5
+; RV64I-NEXT:    sll a5, a7, a5
 ; RV64I-NEXT:    srli a7, t0, 56
 ; RV64I-NEXT:    srli t1, t0, 48
 ; RV64I-NEXT:    srli t2, t0, 40
@@ -2131,8 +2128,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    srli t5, t0, 16
 ; RV64I-NEXT:    srli t6, t0, 8
 ; RV64I-NEXT:    or a4, a0, a4
-; RV64I-NEXT:    or a5, a1, a5
-; RV64I-NEXT:    or a6, a3, a6
+; RV64I-NEXT:    or a6, a1, a6
+; RV64I-NEXT:    or a5, a3, a5
 ; RV64I-NEXT:    sb t3, 28(a2)
 ; RV64I-NEXT:    sb t2, 29(a2)
 ; RV64I-NEXT:    sb t1, 30(a2)
@@ -2141,20 +2138,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb t6, 25(a2)
 ; RV64I-NEXT:    sb t5, 26(a2)
 ; RV64I-NEXT:    sb t4, 27(a2)
-; RV64I-NEXT:    srli a7, a6, 56
-; RV64I-NEXT:    srli t0, a6, 48
-; RV64I-NEXT:    srli t1, a6, 40
-; RV64I-NEXT:    srli t2, a6, 32
-; RV64I-NEXT:    srli t3, a6, 24
-; RV64I-NEXT:    srli t4, a6, 16
-; RV64I-NEXT:    srli a6, a6, 8
-; RV64I-NEXT:    srli t5, a5, 56
-; RV64I-NEXT:    srli t6, a5, 48
-; RV64I-NEXT:    srli s0, a5, 40
-; RV64I-NEXT:    srli s1, a5, 32
-; RV64I-NEXT:    srli s2, a5, 24
-; RV64I-NEXT:    srli s3, a5, 16
+; RV64I-NEXT:    srli a7, a5, 56
+; RV64I-NEXT:    srli t0, a5, 48
+; RV64I-NEXT:    srli t1, a5, 40
+; RV64I-NEXT:    srli t2, a5, 32
+; RV64I-NEXT:    srli t3, a5, 24
+; RV64I-NEXT:    srli t4, a5, 16
 ; RV64I-NEXT:    srli a5, a5, 8
+; RV64I-NEXT:    srli t5, a6, 56
+; RV64I-NEXT:    srli t6, a6, 48
+; RV64I-NEXT:    srli s0, a6, 40
+; RV64I-NEXT:    srli s1, a6, 32
+; RV64I-NEXT:    srli s2, a6, 24
+; RV64I-NEXT:    srli s3, a6, 16
+; RV64I-NEXT:    srli a6, a6, 8
 ; RV64I-NEXT:    srli s4, a4, 56
 ; RV64I-NEXT:    srli s5, a4, 48
 ; RV64I-NEXT:    srli s6, a4, 40
@@ -2164,7 +2161,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a7, 23(a2)
 ; RV64I-NEXT:    srli a7, a4, 32
 ; RV64I-NEXT:    sb a3, 16(a2)
-; RV64I-NEXT:    sb a6, 17(a2)
+; RV64I-NEXT:    sb a5, 17(a2)
 ; RV64I-NEXT:    sb t4, 18(a2)
 ; RV64I-NEXT:    sb t3, 19(a2)
 ; RV64I-NEXT:    srli a3, a4, 24
@@ -2172,10 +2169,10 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb s0, 5(a2)
 ; RV64I-NEXT:    sb t6, 6(a2)
 ; RV64I-NEXT:    sb t5, 7(a2)
-; RV64I-NEXT:    srli a6, a4, 16
+; RV64I-NEXT:    srli a5, a4, 16
 ; RV64I-NEXT:    srli a4, a4, 8
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb a5, 1(a2)
+; RV64I-NEXT:    sb a6, 1(a2)
 ; RV64I-NEXT:    sb s3, 2(a2)
 ; RV64I-NEXT:    sb s2, 3(a2)
 ; RV64I-NEXT:    sb a7, 12(a2)
@@ -2184,7 +2181,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb s4, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    sb a4, 9(a2)
-; RV64I-NEXT:    sb a6, 10(a2)
+; RV64I-NEXT:    sb a5, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
 ; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
@@ -2543,13 +2540,13 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    slli s7, s7, 24
 ; RV64I-NEXT:    or s5, s6, s5
 ; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    or s3, s7, s4
+; RV64I-NEXT:    lbu s4, 4(a1)
 ; RV64I-NEXT:    lbu s6, 5(a1)
-; RV64I-NEXT:    or s4, s7, s4
 ; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli s6, s6, 8
-; RV64I-NEXT:    or s3, s6, s3
+; RV64I-NEXT:    or s4, s6, s4
 ; RV64I-NEXT:    sd zero, 32(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
 ; RV64I-NEXT:    sd zero, 48(sp)
@@ -2566,8 +2563,8 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    or a0, a0, t5
 ; RV64I-NEXT:    or t0, s0, t6
 ; RV64I-NEXT:    or t1, s5, s1
-; RV64I-NEXT:    or t2, s4, s2
-; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    or a1, a1, s4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a6, a6, 32
 ; RV64I-NEXT:    slli a0, a0, 32
@@ -2587,24 +2584,24 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    andi a1, a1, 24
 ; RV64I-NEXT:    andi a0, a3, 32
 ; RV64I-NEXT:    add a1, s6, a1
-; RV64I-NEXT:    ld a4, 0(a1)
-; RV64I-NEXT:    ld a5, 8(a1)
-; RV64I-NEXT:    ld a6, 16(a1)
-; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    xori a4, a0, 63
+; RV64I-NEXT:    ld a5, 0(a1)
+; RV64I-NEXT:    ld a6, 8(a1)
+; RV64I-NEXT:    ld a7, 16(a1)
 ; RV64I-NEXT:    ld t0, 24(a1)
-; RV64I-NEXT:    srl a0, a5, a3
-; RV64I-NEXT:    slli t1, a6, 1
-; RV64I-NEXT:    srl a1, a4, a3
-; RV64I-NEXT:    slli a5, a5, 1
-; RV64I-NEXT:    srl a4, a6, a3
-; RV64I-NEXT:    slli a6, t0, 1
+; RV64I-NEXT:    srl a0, a6, a3
+; RV64I-NEXT:    slli t1, a7, 1
+; RV64I-NEXT:    srl a1, a5, a3
+; RV64I-NEXT:    slli a6, a6, 1
+; RV64I-NEXT:    srl a5, a7, a3
+; RV64I-NEXT:    slli a7, t0, 1
 ; RV64I-NEXT:    srl a3, t0, a3
-; RV64I-NEXT:    sll t0, t1, a7
-; RV64I-NEXT:    sll a5, a5, a7
-; RV64I-NEXT:    sll a6, a6, a7
-; RV64I-NEXT:    srli a7, a4, 24
-; RV64I-NEXT:    srli t1, a4, 16
-; RV64I-NEXT:    srli t2, a4, 8
+; RV64I-NEXT:    sll t0, t1, a4
+; RV64I-NEXT:    sll a6, a6, a4
+; RV64I-NEXT:    sll a4, a7, a4
+; RV64I-NEXT:    srli a7, a5, 24
+; RV64I-NEXT:    srli t1, a5, 16
+; RV64I-NEXT:    srli t2, a5, 8
 ; RV64I-NEXT:    srli t3, a3, 56
 ; RV64I-NEXT:    srli t4, a3, 48
 ; RV64I-NEXT:    srli t5, a3, 40
@@ -2616,19 +2613,19 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    srli s4, a1, 16
 ; RV64I-NEXT:    srli s5, a1, 8
 ; RV64I-NEXT:    srli s6, a0, 24
-; RV64I-NEXT:    or a6, a4, a6
-; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    sb a5, 16(a2)
 ; RV64I-NEXT:    sb t2, 17(a2)
 ; RV64I-NEXT:    sb t1, 18(a2)
 ; RV64I-NEXT:    sb a7, 19(a2)
-; RV64I-NEXT:    srli a4, a0, 16
+; RV64I-NEXT:    srli a5, a0, 16
 ; RV64I-NEXT:    sb t6, 28(a2)
 ; RV64I-NEXT:    sb t5, 29(a2)
 ; RV64I-NEXT:    sb t4, 30(a2)
 ; RV64I-NEXT:    sb t3, 31(a2)
 ; RV64I-NEXT:    srli a7, a0, 8
 ; RV64I-NEXT:    or t0, a0, t0
-; RV64I-NEXT:    or a5, a1, a5
+; RV64I-NEXT:    or a6, a1, a6
 ; RV64I-NEXT:    sb a3, 24(a2)
 ; RV64I-NEXT:    sb s2, 25(a2)
 ; RV64I-NEXT:    sb s1, 26(a2)
@@ -2639,16 +2636,16 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    sb s3, 3(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    sb a7, 9(a2)
-; RV64I-NEXT:    sb a4, 10(a2)
+; RV64I-NEXT:    sb a5, 10(a2)
 ; RV64I-NEXT:    sb s6, 11(a2)
-; RV64I-NEXT:    srli a0, a6, 56
-; RV64I-NEXT:    srli a1, a6, 48
-; RV64I-NEXT:    srli a3, a6, 40
-; RV64I-NEXT:    srli a4, a6, 32
-; RV64I-NEXT:    srli a6, a5, 56
-; RV64I-NEXT:    srli a7, a5, 48
-; RV64I-NEXT:    srli t1, a5, 40
-; RV64I-NEXT:    srli a5, a5, 32
+; RV64I-NEXT:    srli a0, a4, 56
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    srli a3, a4, 40
+; RV64I-NEXT:    srli a4, a4, 32
+; RV64I-NEXT:    srli a5, a6, 56
+; RV64I-NEXT:    srli a7, a6, 48
+; RV64I-NEXT:    srli t1, a6, 40
+; RV64I-NEXT:    srli a6, a6, 32
 ; RV64I-NEXT:    srli t2, t0, 56
 ; RV64I-NEXT:    srli t3, t0, 48
 ; RV64I-NEXT:    srli t4, t0, 40
@@ -2657,10 +2654,10 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    sb a3, 21(a2)
 ; RV64I-NEXT:    sb a1, 22(a2)
 ; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    sb a5, 4(a2)
+; RV64I-NEXT:    sb a6, 4(a2)
 ; RV64I-NEXT:    sb t1, 5(a2)
 ; RV64I-NEXT:    sb a7, 6(a2)
-; RV64I-NEXT:    sb a6, 7(a2)
+; RV64I-NEXT:    sb a5, 7(a2)
 ; RV64I-NEXT:    sb t0, 12(a2)
 ; RV64I-NEXT:    sb t4, 13(a2)
 ; RV64I-NEXT:    sb t3, 14(a2)
@@ -2797,13 +2794,13 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    sw t0, 12(sp)
 ; RV32I-NEXT:    sw t1, 16(sp)
 ; RV32I-NEXT:    sw a5, 20(sp)
-; RV32I-NEXT:    lw a6, 16(t6)
-; RV32I-NEXT:    lw a5, 20(t6)
-; RV32I-NEXT:    lw a7, 24(t6)
 ; RV32I-NEXT:    lw a1, 0(t6)
 ; RV32I-NEXT:    lw a0, 4(t6)
 ; RV32I-NEXT:    lw a4, 8(t6)
 ; RV32I-NEXT:    lw a3, 12(t6)
+; RV32I-NEXT:    lw a6, 16(t6)
+; RV32I-NEXT:    lw a5, 20(t6)
+; RV32I-NEXT:    lw a7, 24(t6)
 ; RV32I-NEXT:    lw t0, 28(t6)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
@@ -3001,9 +2998,9 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    sd a5, 8(sp)
 ; RV64I-NEXT:    sd a3, 16(sp)
 ; RV64I-NEXT:    sd a0, 24(sp)
-; RV64I-NEXT:    ld a4, 16(t3)
-; RV64I-NEXT:    ld a0, 8(t3)
 ; RV64I-NEXT:    ld a1, 0(t3)
+; RV64I-NEXT:    ld a0, 8(t3)
+; RV64I-NEXT:    ld a4, 16(t3)
 ; RV64I-NEXT:    ld a3, 24(t3)
 ; RV64I-NEXT:    srli a5, a4, 56
 ; RV64I-NEXT:    srli a6, a4, 48
@@ -3197,13 +3194,13 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    sw t0, 12(sp)
 ; RV32I-NEXT:    sw t1, 16(sp)
 ; RV32I-NEXT:    sw a5, 20(sp)
-; RV32I-NEXT:    lw a6, 16(t6)
-; RV32I-NEXT:    lw a5, 20(t6)
-; RV32I-NEXT:    lw a7, 24(t6)
 ; RV32I-NEXT:    lw a1, 0(t6)
 ; RV32I-NEXT:    lw a0, 4(t6)
 ; RV32I-NEXT:    lw a4, 8(t6)
 ; RV32I-NEXT:    lw a3, 12(t6)
+; RV32I-NEXT:    lw a6, 16(t6)
+; RV32I-NEXT:    lw a5, 20(t6)
+; RV32I-NEXT:    lw a7, 24(t6)
 ; RV32I-NEXT:    lw t0, 28(t6)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
@@ -3380,13 +3377,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli s7, s7, 24
 ; RV64I-NEXT:    or s5, s6, s5
 ; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    or s3, s7, s4
+; RV64I-NEXT:    lbu s4, 4(a1)
 ; RV64I-NEXT:    lbu s6, 5(a1)
-; RV64I-NEXT:    or s4, s7, s4
 ; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli s6, s6, 8
-; RV64I-NEXT:    or s3, s6, s3
+; RV64I-NEXT:    or s4, s6, s4
 ; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
@@ -3403,8 +3400,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a0, a0, t5
 ; RV64I-NEXT:    or t0, s0, t6
 ; RV64I-NEXT:    or t1, s5, s1
-; RV64I-NEXT:    or t2, s4, s2
-; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    or a1, a1, s4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a6, a6, 32
 ; RV64I-NEXT:    slli a0, a0, 32
@@ -3423,11 +3420,11 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    andi a1, a1, 24
 ; RV64I-NEXT:    sub a1, s6, a1
 ; RV64I-NEXT:    andi a3, a0, 56
+; RV64I-NEXT:    xori a3, a3, 63
 ; RV64I-NEXT:    ld a4, 0(a1)
 ; RV64I-NEXT:    ld a5, 8(a1)
 ; RV64I-NEXT:    ld a6, 16(a1)
 ; RV64I-NEXT:    ld a1, 24(a1)
-; RV64I-NEXT:    xori a3, a3, 63
 ; RV64I-NEXT:    sll a7, a5, a0
 ; RV64I-NEXT:    srli t0, a4, 1
 ; RV64I-NEXT:    sll t1, a1, a0
@@ -3858,13 +3855,13 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV64I-NEXT:    slli s7, s7, 24
 ; RV64I-NEXT:    or s5, s6, s5
 ; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    or s3, s7, s4
+; RV64I-NEXT:    lbu s4, 4(a1)
 ; RV64I-NEXT:    lbu s6, 5(a1)
-; RV64I-NEXT:    or s4, s7, s4
 ; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli s6, s6, 8
-; RV64I-NEXT:    or s3, s6, s3
+; RV64I-NEXT:    or s4, s6, s4
 ; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
@@ -3881,8 +3878,8 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV64I-NEXT:    or a0, a0, t5
 ; RV64I-NEXT:    or t0, s0, t6
 ; RV64I-NEXT:    or t1, s5, s1
-; RV64I-NEXT:    or t2, s4, s2
-; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    or a1, a1, s4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a6, a6, 32
 ; RV64I-NEXT:    slli a0, a0, 32
@@ -3902,25 +3899,25 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV64I-NEXT:    andi a1, a1, 24
 ; RV64I-NEXT:    andi a0, a3, 32
 ; RV64I-NEXT:    sub a1, s6, a1
-; RV64I-NEXT:    ld a4, 0(a1)
-; RV64I-NEXT:    ld a5, 8(a1)
-; RV64I-NEXT:    ld a6, 16(a1)
+; RV64I-NEXT:    xori a4, a0, 63
+; RV64I-NEXT:    ld a5, 0(a1)
+; RV64I-NEXT:    ld a6, 8(a1)
+; RV64I-NEXT:    ld a7, 16(a1)
 ; RV64I-NEXT:    ld a1, 24(a1)
-; RV64I-NEXT:    xori a7, a0, 63
-; RV64I-NEXT:    sll a0, a5, a3
-; RV64I-NEXT:    srli t0, a4, 1
+; RV64I-NEXT:    sll a0, a6, a3
+; RV64I-NEXT:    srli t0, a5, 1
 ; RV64I-NEXT:    sll a1, a1, a3
-; RV64I-NEXT:    srli t1, a6, 1
-; RV64I-NEXT:    sll a6, a6, a3
-; RV64I-NEXT:    srli a5, a5, 1
-; RV64I-NEXT:    sll a3, a4, a3
-; RV64I-NEXT:    srl a4, t0, a7
-; RV64I-NEXT:    srl t0, t1, a7
-; RV64I-NEXT:    srl a5, a5, a7
-; RV64I-NEXT:    srli a7, a6, 56
-; RV64I-NEXT:    srli t1, a6, 48
-; RV64I-NEXT:    srli t2, a6, 40
-; RV64I-NEXT:    srli t3, a6, 32
+; RV64I-NEXT:    srli t1, a7, 1
+; RV64I-NEXT:    sll a7, a7, a3
+; RV64I-NEXT:    srli a6, a6, 1
+; RV64I-NEXT:    sll a3, a5, a3
+; RV64I-NEXT:    srl a5, t0, a4
+; RV64I-NEXT:    srl t0, t1, a4
+; RV64I-NEXT:    srl a4, a6, a4
+; RV64I-NEXT:    srli a6, a7, 56
+; RV64I-NEXT:    srli t1, a7, 48
+; RV64I-NEXT:    srli t2, a7, 40
+; RV64I-NEXT:    srli t3, a7, 32
 ; RV64I-NEXT:    srli t4, a1, 56
 ; RV64I-NEXT:    srli t5, a1, 48
 ; RV64I-NEXT:    srli t6, a1, 40
@@ -3933,19 +3930,19 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV64I-NEXT:    srli s6, a3, 16
 ; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    srli t0, a3, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    srli a6, a0, 56
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    srli a7, a0, 56
 ; RV64I-NEXT:    sb t3, 20(a2)
 ; RV64I-NEXT:    sb t2, 21(a2)
 ; RV64I-NEXT:    sb t1, 22(a2)
-; RV64I-NEXT:    sb a7, 23(a2)
-; RV64I-NEXT:    srli a7, a0, 48
+; RV64I-NEXT:    sb a6, 23(a2)
+; RV64I-NEXT:    srli a6, a0, 48
 ; RV64I-NEXT:    sb s0, 28(a2)
 ; RV64I-NEXT:    sb t6, 29(a2)
 ; RV64I-NEXT:    sb t5, 30(a2)
 ; RV64I-NEXT:    sb t4, 31(a2)
 ; RV64I-NEXT:    srli t1, a0, 40
-; RV64I-NEXT:    or a4, a0, a4
+; RV64I-NEXT:    or a5, a0, a5
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sb s4, 4(a2)
 ; RV64I-NEXT:    sb s3, 5(a2)
@@ -3957,18 +3954,18 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV64I-NEXT:    sb s5, 3(a2)
 ; RV64I-NEXT:    sb a0, 12(a2)
 ; RV64I-NEXT:    sb t1, 13(a2)
-; RV64I-NEXT:    sb a7, 14(a2)
-; RV64I-NEXT:    sb a6, 15(a2)
-; RV64I-NEXT:    srli a0, a5, 24
-; RV64I-NEXT:    srli a3, a5, 16
-; RV64I-NEXT:    srli a6, a5, 8
+; RV64I-NEXT:    sb a6, 14(a2)
+; RV64I-NEXT:    sb a7, 15(a2)
+; RV64I-NEXT:    srli a0, a4, 24
+; RV64I-NEXT:    srli a3, a4, 16
+; RV64I-NEXT:    srli a6, a4, 8
 ; RV64I-NEXT:    srli a7, a1, 24
 ; RV64I-NEXT:    srli t0, a1, 16
 ; RV64I-NEXT:    srli t1, a1, 8
-; RV64I-NEXT:    srli t2, a4, 24
-; RV64I-NEXT:    srli t3, a4, 16
-; RV64I-NEXT:    srli t4, a4, 8
-; RV64I-NEXT:    sb a5, 16(a2)
+; RV64I-NEXT:    srli t2, a5, 24
+; RV64I-NEXT:    srli t3, a5, 16
+; RV64I-NEXT:    srli t4, a5, 8
+; RV64I-NEXT:    sb a4, 16(a2)
 ; RV64I-NEXT:    sb a6, 17(a2)
 ; RV64I-NEXT:    sb a3, 18(a2)
 ; RV64I-NEXT:    sb a0, 19(a2)
@@ -3976,7 +3973,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV64I-NEXT:    sb t1, 25(a2)
 ; RV64I-NEXT:    sb t0, 26(a2)
 ; RV64I-NEXT:    sb a7, 27(a2)
-; RV64I-NEXT:    sb a4, 8(a2)
+; RV64I-NEXT:    sb a5, 8(a2)
 ; RV64I-NEXT:    sb t4, 9(a2)
 ; RV64I-NEXT:    sb t3, 10(a2)
 ; RV64I-NEXT:    sb t2, 11(a2)
@@ -4112,13 +4109,13 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 ; RV32I-NEXT:    sw t0, 44(sp)
 ; RV32I-NEXT:    sw t1, 48(sp)
 ; RV32I-NEXT:    sw a5, 52(sp)
-; RV32I-NEXT:    lw a6, 16(t2)
-; RV32I-NEXT:    lw a5, 20(t2)
-; RV32I-NEXT:    lw a7, 24(t2)
 ; RV32I-NEXT:    lw a1, 0(t2)
 ; RV32I-NEXT:    lw a0, 4(t2)
 ; RV32I-NEXT:    lw a4, 8(t2)
 ; RV32I-NEXT:    lw a3, 12(t2)
+; RV32I-NEXT:    lw a6, 16(t2)
+; RV32I-NEXT:    lw a5, 20(t2)
+; RV32I-NEXT:    lw a7, 24(t2)
 ; RV32I-NEXT:    lw t0, 28(t2)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
@@ -4316,9 +4313,9 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV64I-NEXT:    sd a5, 40(sp)
 ; RV64I-NEXT:    sd a3, 48(sp)
 ; RV64I-NEXT:    sd a0, 56(sp)
-; RV64I-NEXT:    ld a4, 16(t2)
-; RV64I-NEXT:    ld a0, 8(t2)
 ; RV64I-NEXT:    ld a1, 0(t2)
+; RV64I-NEXT:    ld a0, 8(t2)
+; RV64I-NEXT:    ld a4, 16(t2)
 ; RV64I-NEXT:    ld a3, 24(t2)
 ; RV64I-NEXT:    srli a5, a4, 56
 ; RV64I-NEXT:    srli a6, a4, 48
@@ -4512,13 +4509,13 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; RV32I-NEXT:    sw t0, 44(sp)
 ; RV32I-NEXT:    sw t1, 48(sp)
 ; RV32I-NEXT:    sw a5, 52(sp)
-; RV32I-NEXT:    lw a6, 16(t2)
-; RV32I-NEXT:    lw a5, 20(t2)
-; RV32I-NEXT:    lw a7, 24(t2)
 ; RV32I-NEXT:    lw a1, 0(t2)
 ; RV32I-NEXT:    lw a0, 4(t2)
 ; RV32I-NEXT:    lw a4, 8(t2)
 ; RV32I-NEXT:    lw a3, 12(t2)
+; RV32I-NEXT:    lw a6, 16(t2)
+; RV32I-NEXT:    lw a5, 20(t2)
+; RV32I-NEXT:    lw a7, 24(t2)
 ; RV32I-NEXT:    lw t0, 28(t2)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
@@ -4695,13 +4692,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli s7, s7, 24
 ; RV64I-NEXT:    or s5, s6, s5
 ; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    or s3, s7, s4
+; RV64I-NEXT:    lbu s4, 4(a1)
 ; RV64I-NEXT:    lbu s6, 5(a1)
-; RV64I-NEXT:    or s4, s7, s4
 ; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli s6, s6, 8
-; RV64I-NEXT:    or s3, s6, s3
+; RV64I-NEXT:    or s4, s6, s4
 ; RV64I-NEXT:    slli s7, s7, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, s7
@@ -4714,8 +4711,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a0, a0, t5
 ; RV64I-NEXT:    or t0, s0, t6
 ; RV64I-NEXT:    or t1, s5, s1
-; RV64I-NEXT:    or t2, s4, s2
-; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    or a1, a1, s4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a6, a6, 32
 ; RV64I-NEXT:    slli a0, a0, 32
@@ -4739,21 +4736,21 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    andi a1, a1, 24
 ; RV64I-NEXT:    add a1, s6, a1
 ; RV64I-NEXT:    andi a0, a4, 56
+; RV64I-NEXT:    xori a5, a0, 63
 ; RV64I-NEXT:    ld a3, 0(a1)
-; RV64I-NEXT:    ld a5, 8(a1)
-; RV64I-NEXT:    ld a6, 16(a1)
-; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    ld a6, 8(a1)
+; RV64I-NEXT:    ld a7, 16(a1)
 ; RV64I-NEXT:    ld t0, 24(a1)
-; RV64I-NEXT:    srl a0, a5, a4
-; RV64I-NEXT:    slli t1, a6, 1
+; RV64I-NEXT:    srl a0, a6, a4
+; RV64I-NEXT:    slli t1, a7, 1
 ; RV64I-NEXT:    srl a1, a3, a4
-; RV64I-NEXT:    slli a5, a5, 1
-; RV64I-NEXT:    srl a3, a6, a4
-; RV64I-NEXT:    slli a6, t0, 1
+; RV64I-NEXT:    slli a6, a6, 1
+; RV64I-NEXT:    srl a3, a7, a4
+; RV64I-NEXT:    slli a7, t0, 1
 ; RV64I-NEXT:    sra t0, t0, a4
-; RV64I-NEXT:    sll a4, t1, a7
-; RV64I-NEXT:    sll a5, a5, a7
-; RV64I-NEXT:    sll a6, a6, a7
+; RV64I-NEXT:    sll a4, t1, a5
+; RV64I-NEXT:    sll a6, a6, a5
+; RV64I-NEXT:    sll a5, a7, a5
 ; RV64I-NEXT:    srli a7, t0, 56
 ; RV64I-NEXT:    srli t1, t0, 48
 ; RV64I-NEXT:    srli t2, t0, 40
@@ -4762,8 +4759,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    srli t5, t0, 16
 ; RV64I-NEXT:    srli t6, t0, 8
 ; RV64I-NEXT:    or a4, a0, a4
-; RV64I-NEXT:    or a5, a1, a5
-; RV64I-NEXT:    or a6, a3, a6
+; RV64I-NEXT:    or a6, a1, a6
+; RV64I-NEXT:    or a5, a3, a5
 ; RV64I-NEXT:    sb t3, 28(a2)
 ; RV64I-NEXT:    sb t2, 29(a2)
 ; RV64I-NEXT:    sb t1, 30(a2)
@@ -4772,20 +4769,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb t6, 25(a2)
 ; RV64I-NEXT:    sb t5, 26(a2)
 ; RV64I-NEXT:    sb t4, 27(a2)
-; RV64I-NEXT:    srli a7, a6, 56
-; RV64I-NEXT:    srli t0, a6, 48
-; RV64I-NEXT:    srli t1, a6, 40
-; RV64I-NEXT:    srli t2, a6, 32
-; RV64I-NEXT:    srli t3, a6, 24
-; RV64I-NEXT:    srli t4, a6, 16
-; RV64I-NEXT:    srli a6, a6, 8
-; RV64I-NEXT:    srli t5, a5, 56
-; RV64I-NEXT:    srli t6, a5, 48
-; RV64I-NEXT:    srli s0, a5, 40
-; RV64I-NEXT:    srli s1, a5, 32
-; RV64I-NEXT:    srli s2, a5, 24
-; RV64I-NEXT:    srli s3, a5, 16
+; RV64I-NEXT:    srli a7, a5, 56
+; RV64I-NEXT:    srli t0, a5, 48
+; RV64I-NEXT:    srli t1, a5, 40
+; RV64I-NEXT:    srli t2, a5, 32
+; RV64I-NEXT:    srli t3, a5, 24
+; RV64I-NEXT:    srli t4, a5, 16
 ; RV64I-NEXT:    srli a5, a5, 8
+; RV64I-NEXT:    srli t5, a6, 56
+; RV64I-NEXT:    srli t6, a6, 48
+; RV64I-NEXT:    srli s0, a6, 40
+; RV64I-NEXT:    srli s1, a6, 32
+; RV64I-NEXT:    srli s2, a6, 24
+; RV64I-NEXT:    srli s3, a6, 16
+; RV64I-NEXT:    srli a6, a6, 8
 ; RV64I-NEXT:    srli s4, a4, 56
 ; RV64I-NEXT:    srli s5, a4, 48
 ; RV64I-NEXT:    srli s6, a4, 40
@@ -4795,7 +4792,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a7, 23(a2)
 ; RV64I-NEXT:    srli a7, a4, 32
 ; RV64I-NEXT:    sb a3, 16(a2)
-; RV64I-NEXT:    sb a6, 17(a2)
+; RV64I-NEXT:    sb a5, 17(a2)
 ; RV64I-NEXT:    sb t4, 18(a2)
 ; RV64I-NEXT:    sb t3, 19(a2)
 ; RV64I-NEXT:    srli a3, a4, 24
@@ -4803,10 +4800,10 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb s0, 5(a2)
 ; RV64I-NEXT:    sb t6, 6(a2)
 ; RV64I-NEXT:    sb t5, 7(a2)
-; RV64I-NEXT:    srli a6, a4, 16
+; RV64I-NEXT:    srli a5, a4, 16
 ; RV64I-NEXT:    srli a4, a4, 8
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb a5, 1(a2)
+; RV64I-NEXT:    sb a6, 1(a2)
 ; RV64I-NEXT:    sb s3, 2(a2)
 ; RV64I-NEXT:    sb s2, 3(a2)
 ; RV64I-NEXT:    sb a7, 12(a2)
@@ -4815,7 +4812,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb s4, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    sb a4, 9(a2)
-; RV64I-NEXT:    sb a6, 10(a2)
+; RV64I-NEXT:    sb a5, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
 ; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
@@ -5175,13 +5172,13 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    slli s7, s7, 24
 ; RV64I-NEXT:    or s5, s6, s5
 ; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    or s3, s7, s4
+; RV64I-NEXT:    lbu s4, 4(a1)
 ; RV64I-NEXT:    lbu s6, 5(a1)
-; RV64I-NEXT:    or s4, s7, s4
 ; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli s6, s6, 8
-; RV64I-NEXT:    or s3, s6, s3
+; RV64I-NEXT:    or s4, s6, s4
 ; RV64I-NEXT:    slli s7, s7, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, s7
@@ -5194,8 +5191,8 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    or a0, a0, t5
 ; RV64I-NEXT:    or t0, s0, t6
 ; RV64I-NEXT:    or t1, s5, s1
-; RV64I-NEXT:    or t2, s4, s2
-; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    or a1, a1, s4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a6, a6, 32
 ; RV64I-NEXT:    slli a0, a0, 32
@@ -5220,24 +5217,24 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    andi a1, a1, 24
 ; RV64I-NEXT:    andi a0, a3, 32
 ; RV64I-NEXT:    add a1, s6, a1
-; RV64I-NEXT:    ld a4, 0(a1)
-; RV64I-NEXT:    ld a5, 8(a1)
-; RV64I-NEXT:    ld a6, 16(a1)
-; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    xori a4, a0, 63
+; RV64I-NEXT:    ld a5, 0(a1)
+; RV64I-NEXT:    ld a6, 8(a1)
+; RV64I-NEXT:    ld a7, 16(a1)
 ; RV64I-NEXT:    ld t0, 24(a1)
-; RV64I-NEXT:    srl a0, a5, a3
-; RV64I-NEXT:    slli t1, a6, 1
-; RV64I-NEXT:    srl a1, a4, a3
-; RV64I-NEXT:    slli a5, a5, 1
-; RV64I-NEXT:    srl a4, a6, a3
-; RV64I-NEXT:    slli a6, t0, 1
+; RV64I-NEXT:    srl a0, a6, a3
+; RV64I-NEXT:    slli t1, a7, 1
+; RV64I-NEXT:    srl a1, a5, a3
+; RV64I-NEXT:    slli a6, a6, 1
+; RV64I-NEXT:    srl a5, a7, a3
+; RV64I-NEXT:    slli a7, t0, 1
 ; RV64I-NEXT:    sra a3, t0, a3
-; RV64I-NEXT:    sll t0, t1, a7
-; RV64I-NEXT:    sll a5, a5, a7
-; RV64I-NEXT:    sll a6, a6, a7
-; RV64I-NEXT:    srli a7, a4, 24
-; RV64I-NEXT:    srli t1, a4, 16
-; RV64I-NEXT:    srli t2, a4, 8
+; RV64I-NEXT:    sll t0, t1, a4
+; RV64I-NEXT:    sll a6, a6, a4
+; RV64I-NEXT:    sll a4, a7, a4
+; RV64I-NEXT:    srli a7, a5, 24
+; RV64I-NEXT:    srli t1, a5, 16
+; RV64I-NEXT:    srli t2, a5, 8
 ; RV64I-NEXT:    srli t3, a3, 56
 ; RV64I-NEXT:    srli t4, a3, 48
 ; RV64I-NEXT:    srli t5, a3, 40
@@ -5249,19 +5246,19 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    srli s4, a1, 16
 ; RV64I-NEXT:    srli s5, a1, 8
 ; RV64I-NEXT:    srli s6, a0, 24
-; RV64I-NEXT:    or a6, a4, a6
-; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    sb a5, 16(a2)
 ; RV64I-NEXT:    sb t2, 17(a2)
 ; RV64I-NEXT:    sb t1, 18(a2)
 ; RV64I-NEXT:    sb a7, 19(a2)
-; RV64I-NEXT:    srli a4, a0, 16
+; RV64I-NEXT:    srli a5, a0, 16
 ; RV64I-NEXT:    sb t6, 28(a2)
 ; RV64I-NEXT:    sb t5, 29(a2)
 ; RV64I-NEXT:    sb t4, 30(a2)
 ; RV64I-NEXT:    sb t3, 31(a2)
 ; RV64I-NEXT:    srli a7, a0, 8
 ; RV64I-NEXT:    or t0, a0, t0
-; RV64I-NEXT:    or a5, a1, a5
+; RV64I-NEXT:    or a6, a1, a6
 ; RV64I-NEXT:    sb a3, 24(a2)
 ; RV64I-NEXT:    sb s2, 25(a2)
 ; RV64I-NEXT:    sb s1, 26(a2)
@@ -5272,16 +5269,16 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    sb s3, 3(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    sb a7, 9(a2)
-; RV64I-NEXT:    sb a4, 10(a2)
+; RV64I-NEXT:    sb a5, 10(a2)
 ; RV64I-NEXT:    sb s6, 11(a2)
-; RV64I-NEXT:    srli a0, a6, 56
-; RV64I-NEXT:    srli a1, a6, 48
-; RV64I-NEXT:    srli a3, a6, 40
-; RV64I-NEXT:    srli a4, a6, 32
-; RV64I-NEXT:    srli a6, a5, 56
-; RV64I-NEXT:    srli a7, a5, 48
-; RV64I-NEXT:    srli t1, a5, 40
-; RV64I-NEXT:    srli a5, a5, 32
+; RV64I-NEXT:    srli a0, a4, 56
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    srli a3, a4, 40
+; RV64I-NEXT:    srli a4, a4, 32
+; RV64I-NEXT:    srli a5, a6, 56
+; RV64I-NEXT:    srli a7, a6, 48
+; RV64I-NEXT:    srli t1, a6, 40
+; RV64I-NEXT:    srli a6, a6, 32
 ; RV64I-NEXT:    srli t2, t0, 56
 ; RV64I-NEXT:    srli t3, t0, 48
 ; RV64I-NEXT:    srli t4, t0, 40
@@ -5290,10 +5287,10 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV64I-NEXT:    sb a3, 21(a2)
 ; RV64I-NEXT:    sb a1, 22(a2)
 ; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    sb a5, 4(a2)
+; RV64I-NEXT:    sb a6, 4(a2)
 ; RV64I-NEXT:    sb t1, 5(a2)
 ; RV64I-NEXT:    sb a7, 6(a2)
-; RV64I-NEXT:    sb a6, 7(a2)
+; RV64I-NEXT:    sb a5, 7(a2)
 ; RV64I-NEXT:    sb t0, 12(a2)
 ; RV64I-NEXT:    sb t4, 13(a2)
 ; RV64I-NEXT:    sb t3, 14(a2)
@@ -5431,13 +5428,13 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 ; RV32I-NEXT:    sw t0, 12(sp)
 ; RV32I-NEXT:    sw t1, 16(sp)
 ; RV32I-NEXT:    sw a5, 20(sp)
-; RV32I-NEXT:    lw a6, 16(s6)
-; RV32I-NEXT:    lw a5, 20(s6)
-; RV32I-NEXT:    lw a7, 24(s6)
 ; RV32I-NEXT:    lw a1, 0(s6)
 ; RV32I-NEXT:    lw a0, 4(s6)
 ; RV32I-NEXT:    lw a4, 8(s6)
 ; RV32I-NEXT:    lw a3, 12(s6)
+; RV32I-NEXT:    lw a6, 16(s6)
+; RV32I-NEXT:    lw a5, 20(s6)
+; RV32I-NEXT:    lw a7, 24(s6)
 ; RV32I-NEXT:    lw t0, 28(s6)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
@@ -5636,9 +5633,9 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV64I-NEXT:    sd a7, 8(sp)
 ; RV64I-NEXT:    sd a3, 16(sp)
 ; RV64I-NEXT:    sd a1, 24(sp)
-; RV64I-NEXT:    ld a4, 16(t5)
-; RV64I-NEXT:    ld a0, 8(t5)
 ; RV64I-NEXT:    ld a1, 0(t5)
+; RV64I-NEXT:    ld a0, 8(t5)
+; RV64I-NEXT:    ld a4, 16(t5)
 ; RV64I-NEXT:    ld a3, 24(t5)
 ; RV64I-NEXT:    srli a5, a4, 56
 ; RV64I-NEXT:    srli a6, a4, 48
@@ -5833,13 +5830,13 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; RV32I-NEXT:    sw t0, 12(sp)
 ; RV32I-NEXT:    sw t1, 16(sp)
 ; RV32I-NEXT:    sw a5, 20(sp)
-; RV32I-NEXT:    lw a6, 16(s6)
-; RV32I-NEXT:    lw a5, 20(s6)
-; RV32I-NEXT:    lw a7, 24(s6)
 ; RV32I-NEXT:    lw a1, 0(s6)
 ; RV32I-NEXT:    lw a0, 4(s6)
 ; RV32I-NEXT:    lw a4, 8(s6)
 ; RV32I-NEXT:    lw a3, 12(s6)
+; RV32I-NEXT:    lw a6, 16(s6)
+; RV32I-NEXT:    lw a5, 20(s6)
+; RV32I-NEXT:    lw a7, 24(s6)
 ; RV32I-NEXT:    lw t0, 28(s6)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index b2c130c2d7c10..f02ffa8951ad7 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -5,12 +5,12 @@
 define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_4bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lb a0, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
@@ -28,26 +28,26 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_4bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 2(a0)
-; RV32I-NEXT:    lbu a5, 3(a0)
-; RV32I-NEXT:    lbu a0, 0(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu a6, 1(a1)
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 16
 ; RV32I-NEXT:    srli a3, a0, 24
@@ -66,12 +66,12 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_4bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lb a0, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
@@ -89,26 +89,26 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_4bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 2(a0)
-; RV32I-NEXT:    lbu a5, 3(a0)
-; RV32I-NEXT:    lbu a0, 0(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu a6, 1(a1)
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 16
 ; RV32I-NEXT:    srli a3, a0, 24
@@ -127,12 +127,12 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_4bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lb a0, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
@@ -150,26 +150,26 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_4bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 2(a0)
-; RV32I-NEXT:    lbu a5, 3(a0)
-; RV32I-NEXT:    lbu a0, 0(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu a6, 1(a1)
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    sra a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 16
 ; RV32I-NEXT:    srli a3, a0, 24
@@ -215,20 +215,20 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    or a7, t2, t0
+; RV64I-NEXT:    lbu t0, 4(a1)
 ; RV64I-NEXT:    lbu t1, 5(a1)
-; RV64I-NEXT:    or t0, t2, t0
 ; RV64I-NEXT:    lbu t2, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or a7, t1, a7
+; RV64I-NEXT:    or t0, t1, t0
 ; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    or a4, t0, a6
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a4, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a0, a0, a3
@@ -253,39 +253,39 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_8bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a5, 7(a0)
-; RV32I-NEXT:    lbu a6, 4(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a3, a6
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 1(a1)
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a3, 4(a0)
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 6(a0)
+; RV32I-NEXT:    lbu a6, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    or a5, a4, a3
-; RV32I-NEXT:    or a4, a1, a6
-; RV32I-NEXT:    addi a3, a4, -32
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a5, a5, a3
+; RV32I-NEXT:    or a4, a1, a4
 ; RV32I-NEXT:    srl a1, a5, a4
+; RV32I-NEXT:    addi a3, a4, -32
 ; RV32I-NEXT:    bltz a3, .LBB3_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    j .LBB3_3
 ; RV32I-NEXT:  .LBB3_2:
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
+; RV32I-NEXT:    lbu a6, 0(a0)
+; RV32I-NEXT:    lbu a7, 1(a0)
 ; RV32I-NEXT:    lbu t0, 2(a0)
 ; RV32I-NEXT:    lbu a0, 3(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a6, a7, a6
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, t0
@@ -348,20 +348,20 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    or a7, t2, t0
+; RV64I-NEXT:    lbu t0, 4(a1)
 ; RV64I-NEXT:    lbu t1, 5(a1)
-; RV64I-NEXT:    or t0, t2, t0
 ; RV64I-NEXT:    lbu t2, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or a7, t1, a7
+; RV64I-NEXT:    or t0, t1, t0
 ; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    or a4, t0, a6
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a4, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a0, a0, a3
@@ -386,39 +386,39 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_8bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 2(a0)
-; RV32I-NEXT:    lbu a5, 3(a0)
-; RV32I-NEXT:    lbu a6, 0(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a3, a6
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 1(a1)
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    or a5, a4, a3
-; RV32I-NEXT:    or a4, a1, a6
-; RV32I-NEXT:    addi a3, a4, -32
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a5, a5, a3
+; RV32I-NEXT:    or a4, a1, a4
 ; RV32I-NEXT:    sll a1, a5, a4
+; RV32I-NEXT:    addi a3, a4, -32
 ; RV32I-NEXT:    bltz a3, .LBB4_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    j .LBB4_3
 ; RV32I-NEXT:  .LBB4_2:
-; RV32I-NEXT:    lbu a6, 5(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
 ; RV32I-NEXT:    lbu t0, 6(a0)
 ; RV32I-NEXT:    lbu a0, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a6, a7, a6
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, t0
@@ -481,20 +481,20 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    or a7, t2, t0
+; RV64I-NEXT:    lbu t0, 4(a1)
 ; RV64I-NEXT:    lbu t1, 5(a1)
-; RV64I-NEXT:    or t0, t2, t0
 ; RV64I-NEXT:    lbu t2, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or a7, t1, a7
+; RV64I-NEXT:    or t0, t1, t0
 ; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    or a4, t0, a6
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a4, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a0, a0, a3
@@ -519,41 +519,40 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_8bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a3, 4(a0)
+; RV32I-NEXT:    lbu a4, 5(a0)
 ; RV32I-NEXT:    lbu a5, 6(a0)
 ; RV32I-NEXT:    lbu a6, 7(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    lbu a7, 0(a1)
-; RV32I-NEXT:    lbu t0, 1(a1)
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    lbu a4, 2(a1)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t0, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, a4
+; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    or a1, a1, t0
 ; RV32I-NEXT:    slli a4, a5, 16
 ; RV32I-NEXT:    slli a5, a6, 24
 ; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    or a4, a4, a3
 ; RV32I-NEXT:    or a3, a1, a7
-; RV32I-NEXT:    addi a6, a3, -32
 ; RV32I-NEXT:    sra a1, a4, a3
+; RV32I-NEXT:    addi a6, a3, -32
 ; RV32I-NEXT:    bltz a6, .LBB5_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    srai a5, a5, 31
 ; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    srai a1, a5, 31
 ; RV32I-NEXT:    j .LBB5_3
 ; RV32I-NEXT:  .LBB5_2:
-; RV32I-NEXT:    lbu a5, 1(a0)
-; RV32I-NEXT:    lbu a6, 0(a0)
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a6, 1(a0)
 ; RV32I-NEXT:    lbu a7, 2(a0)
 ; RV32I-NEXT:    lbu a0, 3(a0)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a7
@@ -615,53 +614,53 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli t3, t3, 24
 ; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    or a7, t3, t0
+; RV64I-NEXT:    lbu t0, 4(a1)
 ; RV64I-NEXT:    lbu t2, 5(a1)
-; RV64I-NEXT:    or t0, t3, t0
 ; RV64I-NEXT:    lbu t3, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    or t0, t2, t0
 ; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, t1, a5
-; RV64I-NEXT:    or a6, t0, a6
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a5, a4, a3
 ; RV64I-NEXT:    or a4, a1, a6
-; RV64I-NEXT:    addi a3, a4, -64
 ; RV64I-NEXT:    srl a1, a5, a4
+; RV64I-NEXT:    addi a3, a4, -64
 ; RV64I-NEXT:    bltz a3, .LBB6_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB6_3
 ; RV64I-NEXT:  .LBB6_2:
-; RV64I-NEXT:    lbu a6, 1(a0)
-; RV64I-NEXT:    lbu a7, 2(a0)
-; RV64I-NEXT:    lbu t0, 3(a0)
-; RV64I-NEXT:    lbu t1, 0(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, a6, t1
-; RV64I-NEXT:    lbu t1, 4(a0)
-; RV64I-NEXT:    lbu t2, 5(a0)
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
+; RV64I-NEXT:    lbu t0, 2(a0)
+; RV64I-NEXT:    lbu t1, 3(a0)
+; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    lbu t0, 4(a0)
+; RV64I-NEXT:    lbu t1, 5(a0)
+; RV64I-NEXT:    lbu t2, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, t2
 ; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    not a7, a4
 ; RV64I-NEXT:    slli a5, a5, 1
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t0
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    srl a0, a0, a4
@@ -740,20 +739,20 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or t3, t4, t3
 ; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    lbu t1, 0(a1)
-; RV32I-NEXT:    lbu t4, 1(a1)
 ; RV32I-NEXT:    or a0, a0, t2
-; RV32I-NEXT:    lbu t2, 2(a1)
+; RV32I-NEXT:    lbu t1, 0(a1)
+; RV32I-NEXT:    lbu t2, 1(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t1, t4, t1
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t1, t2, t1
 ; RV32I-NEXT:    sw zero, 16(sp)
 ; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t4, t4, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t2
+; RV32I-NEXT:    or a1, a1, t4
 ; RV32I-NEXT:    mv t2, sp
 ; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    or a4, t0, a7
@@ -767,11 +766,11 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli a0, a1, 3
 ; RV32I-NEXT:    andi a3, a1, 31
 ; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    add a0, t2, a0
 ; RV32I-NEXT:    lw a4, 0(a0)
 ; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a6, 8(a0)
-; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    lw a0, 12(a0)
 ; RV32I-NEXT:    srl a7, a5, a1
 ; RV32I-NEXT:    slli t0, a6, 1
@@ -851,53 +850,53 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli t3, t3, 24
 ; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    or a7, t3, t0
+; RV64I-NEXT:    lbu t0, 4(a1)
 ; RV64I-NEXT:    lbu t2, 5(a1)
-; RV64I-NEXT:    or t0, t3, t0
 ; RV64I-NEXT:    lbu t3, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    or t0, t2, t0
 ; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, t1, a5
-; RV64I-NEXT:    or a6, t0, a6
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a5, a4, a3
 ; RV64I-NEXT:    or a4, a1, a6
-; RV64I-NEXT:    addi a3, a4, -64
 ; RV64I-NEXT:    sll a1, a5, a4
+; RV64I-NEXT:    addi a3, a4, -64
 ; RV64I-NEXT:    bltz a3, .LBB7_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB7_3
 ; RV64I-NEXT:  .LBB7_2:
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    lbu a7, 10(a0)
-; RV64I-NEXT:    lbu t0, 11(a0)
-; RV64I-NEXT:    lbu t1, 8(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, a6, t1
-; RV64I-NEXT:    lbu t1, 12(a0)
-; RV64I-NEXT:    lbu t2, 13(a0)
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 14(a0)
-; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a7, 9(a0)
+; RV64I-NEXT:    lbu t0, 10(a0)
+; RV64I-NEXT:    lbu t1, 11(a0)
+; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    lbu t0, 12(a0)
+; RV64I-NEXT:    lbu t1, 13(a0)
+; RV64I-NEXT:    lbu t2, 14(a0)
+; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, t2
 ; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    not a7, a4
 ; RV64I-NEXT:    srli a5, a5, 1
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t0
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    sll a0, a0, a4
@@ -976,20 +975,20 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or t3, t4, t3
 ; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    lbu t1, 0(a1)
-; RV32I-NEXT:    lbu t4, 1(a1)
 ; RV32I-NEXT:    or a0, a0, t2
-; RV32I-NEXT:    lbu t2, 2(a1)
+; RV32I-NEXT:    lbu t1, 0(a1)
+; RV32I-NEXT:    lbu t2, 1(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t1, t4, t1
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t1, t2, t1
 ; RV32I-NEXT:    sw zero, 0(sp)
 ; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t4, t4, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t2
+; RV32I-NEXT:    or a1, a1, t4
 ; RV32I-NEXT:    addi t2, sp, 16
 ; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    or a4, t0, a7
@@ -1003,12 +1002,12 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli a0, a1, 3
 ; RV32I-NEXT:    andi a3, a1, 31
 ; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    sub a0, t2, a0
 ; RV32I-NEXT:    lw a4, 0(a0)
 ; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a6, 8(a0)
 ; RV32I-NEXT:    lw a0, 12(a0)
-; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    sll a7, a5, a1
 ; RV32I-NEXT:    srli t0, a4, 1
 ; RV32I-NEXT:    sll a0, a0, a1
@@ -1087,55 +1086,54 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli t3, t3, 24
 ; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    or a7, t3, t0
+; RV64I-NEXT:    lbu t0, 4(a1)
 ; RV64I-NEXT:    lbu t2, 5(a1)
-; RV64I-NEXT:    or t0, t3, t0
 ; RV64I-NEXT:    lbu t3, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    or t0, t2, t0
 ; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a5, t1, a5
-; RV64I-NEXT:    or a6, t0, a6
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a4, a5, 32
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a4, a4, a3
 ; RV64I-NEXT:    or a3, a1, a6
-; RV64I-NEXT:    addi a6, a3, -64
 ; RV64I-NEXT:    sra a1, a4, a3
+; RV64I-NEXT:    addi a6, a3, -64
 ; RV64I-NEXT:    bltz a6, .LBB8_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    sraiw a3, a5, 31
 ; RV64I-NEXT:    mv a0, a1
-; RV64I-NEXT:    mv a1, a3
+; RV64I-NEXT:    sraiw a1, a5, 31
 ; RV64I-NEXT:    j .LBB8_3
 ; RV64I-NEXT:  .LBB8_2:
-; RV64I-NEXT:    lbu a5, 1(a0)
-; RV64I-NEXT:    lbu a6, 2(a0)
-; RV64I-NEXT:    lbu a7, 3(a0)
-; RV64I-NEXT:    lbu t0, 0(a0)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a5, t0
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu t1, 5(a0)
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    lbu a5, 0(a0)
+; RV64I-NEXT:    lbu a6, 1(a0)
+; RV64I-NEXT:    lbu a7, 2(a0)
+; RV64I-NEXT:    lbu t0, 3(a0)
+; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    not a6, a3
 ; RV64I-NEXT:    slli a4, a4, 1
-; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    srl a0, a0, a3
@@ -1209,26 +1207,26 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli t1, t1, 8
 ; RV32I-NEXT:    or a4, t3, a4
 ; RV32I-NEXT:    or t3, t5, t4
-; RV32I-NEXT:    lbu t4, 0(a1)
-; RV32I-NEXT:    lbu t5, 1(a1)
 ; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    lbu t1, 2(a1)
+; RV32I-NEXT:    lbu t1, 0(a1)
+; RV32I-NEXT:    lbu t4, 1(a1)
+; RV32I-NEXT:    lbu t5, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or t1, t4, t1
+; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    or a1, a1, t5
 ; RV32I-NEXT:    or a3, a5, a3
 ; RV32I-NEXT:    mv a5, sp
 ; RV32I-NEXT:    slli t2, t2, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or t1, a0, t2
+; RV32I-NEXT:    or t2, a0, t2
 ; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    or a6, a7, a6
 ; RV32I-NEXT:    or a4, t3, a4
-; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    or a7, t2, t0
+; RV32I-NEXT:    or a1, a1, t1
 ; RV32I-NEXT:    sw a0, 16(sp)
 ; RV32I-NEXT:    sw a0, 20(sp)
 ; RV32I-NEXT:    sw a0, 24(sp)
@@ -1240,11 +1238,11 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli a0, a1, 3
 ; RV32I-NEXT:    andi a3, a1, 31
 ; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    add a0, a5, a0
 ; RV32I-NEXT:    lw a4, 0(a0)
 ; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a6, 8(a0)
-; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    lw a0, 12(a0)
 ; RV32I-NEXT:    srl a7, a5, a1
 ; RV32I-NEXT:    slli t0, a6, 1
@@ -1392,13 +1390,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli s7, s7, 24
 ; RV64I-NEXT:    or s5, s6, s5
 ; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    or s3, s7, s4
+; RV64I-NEXT:    lbu s4, 4(a1)
 ; RV64I-NEXT:    lbu s6, 5(a1)
-; RV64I-NEXT:    or s4, s7, s4
 ; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli s6, s6, 8
-; RV64I-NEXT:    or s3, s6, s3
+; RV64I-NEXT:    or s4, s6, s4
 ; RV64I-NEXT:    slli s7, s7, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, s7
@@ -1415,8 +1413,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a0, a0, t5
 ; RV64I-NEXT:    or t1, s0, t6
 ; RV64I-NEXT:    or t2, s5, s1
-; RV64I-NEXT:    or t3, s4, s2
-; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    or t3, s3, s2
+; RV64I-NEXT:    or a1, a1, s4
 ; RV64I-NEXT:    slli a3, a3, 32
 ; RV64I-NEXT:    slli a7, a7, 32
 ; RV64I-NEXT:    slli a0, a0, 32
@@ -1434,11 +1432,11 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    srli a0, a1, 3
 ; RV64I-NEXT:    andi a3, a1, 63
 ; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    xori a3, a3, 63
 ; RV64I-NEXT:    add a0, a6, a0
 ; RV64I-NEXT:    ld a4, 0(a0)
 ; RV64I-NEXT:    ld a5, 8(a0)
 ; RV64I-NEXT:    ld a6, 16(a0)
-; RV64I-NEXT:    xori a3, a3, 63
 ; RV64I-NEXT:    ld a0, 24(a0)
 ; RV64I-NEXT:    srl a7, a5, a1
 ; RV64I-NEXT:    slli t0, a6, 1
@@ -1868,13 +1866,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli s7, s7, 24
 ; RV64I-NEXT:    or s5, s6, s5
 ; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    or s3, s7, s4
+; RV64I-NEXT:    lbu s4, 4(a1)
 ; RV64I-NEXT:    lbu s6, 5(a1)
-; RV64I-NEXT:    or s4, s7, s4
 ; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli s6, s6, 8
-; RV64I-NEXT:    or s3, s6, s3
+; RV64I-NEXT:    or s4, s6, s4
 ; RV64I-NEXT:    slli s7, s7, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, s7
@@ -1891,8 +1889,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a0, a0, t5
 ; RV64I-NEXT:    or t1, s0, t6
 ; RV64I-NEXT:    or t2, s5, s1
-; RV64I-NEXT:    or t3, s4, s2
-; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    or t3, s3, s2
+; RV64I-NEXT:    or a1, a1, s4
 ; RV64I-NEXT:    slli a3, a3, 32
 ; RV64I-NEXT:    slli a7, a7, 32
 ; RV64I-NEXT:    slli a0, a0, 32
@@ -1910,12 +1908,12 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    srli a0, a1, 3
 ; RV64I-NEXT:    andi a3, a1, 63
 ; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    xori a3, a3, 63
 ; RV64I-NEXT:    sub a0, a6, a0
 ; RV64I-NEXT:    ld a4, 0(a0)
 ; RV64I-NEXT:    ld a5, 8(a0)
 ; RV64I-NEXT:    ld a6, 16(a0)
 ; RV64I-NEXT:    ld a0, 24(a0)
-; RV64I-NEXT:    xori a3, a3, 63
 ; RV64I-NEXT:    sll a7, a5, a1
 ; RV64I-NEXT:    srli t0, a4, 1
 ; RV64I-NEXT:    sll t1, a0, a1
@@ -2344,13 +2342,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli s7, s7, 24
 ; RV64I-NEXT:    or s5, s6, s5
 ; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    or s3, s7, s4
+; RV64I-NEXT:    lbu s4, 4(a1)
 ; RV64I-NEXT:    lbu s6, 5(a1)
-; RV64I-NEXT:    or s4, s7, s4
 ; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli s6, s6, 8
-; RV64I-NEXT:    or s3, s6, s3
+; RV64I-NEXT:    or s4, s6, s4
 ; RV64I-NEXT:    slli s7, s7, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, s7
@@ -2363,8 +2361,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a0, a0, t5
 ; RV64I-NEXT:    or t0, s0, t6
 ; RV64I-NEXT:    or t1, s5, s1
-; RV64I-NEXT:    or t2, s4, s2
-; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    or a1, a1, s4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a6, a6, 32
 ; RV64I-NEXT:    slli a0, a0, 32
@@ -2387,11 +2385,11 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    srli a0, a1, 3
 ; RV64I-NEXT:    andi a3, a1, 63
 ; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    xori a3, a3, 63
 ; RV64I-NEXT:    add a0, s6, a0
 ; RV64I-NEXT:    ld a4, 0(a0)
 ; RV64I-NEXT:    ld a5, 8(a0)
 ; RV64I-NEXT:    ld a6, 16(a0)
-; RV64I-NEXT:    xori a3, a3, 63
 ; RV64I-NEXT:    ld a0, 24(a0)
 ; RV64I-NEXT:    srl a7, a5, a1
 ; RV64I-NEXT:    slli t0, a6, 1
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index a30593d7d7afb..a496699f7e386 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -1713,8 +1713,8 @@ define signext i32 @umulo3.i32(i32 signext %0, i32 signext %1, ptr %2) {
 ; RV64-NEXT:    mulhu a0, a0, a1
 ; RV64-NEXT:    srli a1, a0, 32
 ; RV64-NEXT:    snez a1, a1
-; RV64-NEXT:    sext.w a0, a0
 ; RV64-NEXT:    sw a1, 0(a2)
+; RV64-NEXT:    sext.w a0, a0
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: umulo3.i32:
@@ -1733,8 +1733,8 @@ define signext i32 @umulo3.i32(i32 signext %0, i32 signext %1, ptr %2) {
 ; RV64ZBA-NEXT:    mul a3, a0, a1
 ; RV64ZBA-NEXT:    srli a3, a3, 32
 ; RV64ZBA-NEXT:    snez a3, a3
-; RV64ZBA-NEXT:    mulw a0, a0, a1
 ; RV64ZBA-NEXT:    sw a3, 0(a2)
+; RV64ZBA-NEXT:    mulw a0, a0, a1
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: umulo3.i32:
@@ -1753,8 +1753,8 @@ define signext i32 @umulo3.i32(i32 signext %0, i32 signext %1, ptr %2) {
 ; RV64ZICOND-NEXT:    mulhu a0, a0, a1
 ; RV64ZICOND-NEXT:    srli a1, a0, 32
 ; RV64ZICOND-NEXT:    snez a1, a1
-; RV64ZICOND-NEXT:    sext.w a0, a0
 ; RV64ZICOND-NEXT:    sw a1, 0(a2)
+; RV64ZICOND-NEXT:    sext.w a0, a0
 ; RV64ZICOND-NEXT:    ret
   %4 = tail call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %0, i32 %1)
   %5 = extractvalue { i32, i1 } %4, 1
diff --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
index e761fcb736a87..f6b7f97f6525c 100644
--- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
@@ -292,12 +292,12 @@ define ptr @lwuib(ptr %base, i64 %a, ptr %addr.1) {
 define ptr @ldia(ptr %base, ptr %addr.2, i64 %a) {
 ; RV32XTHEADMEMIDX-LABEL: ldia:
 ; RV32XTHEADMEMIDX:       # %bb.0:
-; RV32XTHEADMEMIDX-NEXT:    lw a4, 4(a0)
-; RV32XTHEADMEMIDX-NEXT:    lw a5, 0(a0)
+; RV32XTHEADMEMIDX-NEXT:    lw a4, 0(a0)
+; RV32XTHEADMEMIDX-NEXT:    lw a5, 4(a0)
 ; RV32XTHEADMEMIDX-NEXT:    addi a0, a0, -128
-; RV32XTHEADMEMIDX-NEXT:    add a3, a4, a3
-; RV32XTHEADMEMIDX-NEXT:    add a2, a5, a2
-; RV32XTHEADMEMIDX-NEXT:    sltu a4, a2, a5
+; RV32XTHEADMEMIDX-NEXT:    add a3, a5, a3
+; RV32XTHEADMEMIDX-NEXT:    add a2, a4, a2
+; RV32XTHEADMEMIDX-NEXT:    sltu a4, a2, a4
 ; RV32XTHEADMEMIDX-NEXT:    add a3, a3, a4
 ; RV32XTHEADMEMIDX-NEXT:    sw a2, 0(a1)
 ; RV32XTHEADMEMIDX-NEXT:    sw a3, 4(a1)
@@ -859,9 +859,9 @@ define i64 @lrd(ptr %a, i64 %b) {
 ; RV32XTHEADMEMIDX-LABEL: lrd:
 ; RV32XTHEADMEMIDX:       # %bb.0:
 ; RV32XTHEADMEMIDX-NEXT:    slli a2, a1, 3
+; RV32XTHEADMEMIDX-NEXT:    add a2, a0, a2
+; RV32XTHEADMEMIDX-NEXT:    lw a2, 4(a2)
 ; RV32XTHEADMEMIDX-NEXT:    th.lrw a1, a0, a1, 3
-; RV32XTHEADMEMIDX-NEXT:    add a0, a0, a2
-; RV32XTHEADMEMIDX-NEXT:    lw a2, 4(a0)
 ; RV32XTHEADMEMIDX-NEXT:    add a0, a1, a1
 ; RV32XTHEADMEMIDX-NEXT:    sltu a1, a0, a1
 ; RV32XTHEADMEMIDX-NEXT:    add a2, a2, a2
@@ -883,8 +883,8 @@ define i64 @lrd_2(ptr %a, i64 %b) {
 ; RV32XTHEADMEMIDX-LABEL: lrd_2:
 ; RV32XTHEADMEMIDX:       # %bb.0:
 ; RV32XTHEADMEMIDX-NEXT:    addi a2, a0, 96
-; RV32XTHEADMEMIDX-NEXT:    th.lrw a2, a2, a1, 3
 ; RV32XTHEADMEMIDX-NEXT:    addi a0, a0, 100
+; RV32XTHEADMEMIDX-NEXT:    th.lrw a2, a2, a1, 3
 ; RV32XTHEADMEMIDX-NEXT:    th.lrw a1, a0, a1, 3
 ; RV32XTHEADMEMIDX-NEXT:    add a0, a2, a2
 ; RV32XTHEADMEMIDX-NEXT:    sltu a2, a0, a2
@@ -909,9 +909,9 @@ define i64 @lurd(ptr %a, i32 %b) {
 ; RV32XTHEADMEMIDX-LABEL: lurd:
 ; RV32XTHEADMEMIDX:       # %bb.0:
 ; RV32XTHEADMEMIDX-NEXT:    slli a2, a1, 3
+; RV32XTHEADMEMIDX-NEXT:    add a2, a0, a2
+; RV32XTHEADMEMIDX-NEXT:    lw a2, 4(a2)
 ; RV32XTHEADMEMIDX-NEXT:    th.lrw a1, a0, a1, 3
-; RV32XTHEADMEMIDX-NEXT:    add a0, a0, a2
-; RV32XTHEADMEMIDX-NEXT:    lw a2, 4(a0)
 ; RV32XTHEADMEMIDX-NEXT:    add a0, a1, a1
 ; RV32XTHEADMEMIDX-NEXT:    sltu a1, a0, a1
 ; RV32XTHEADMEMIDX-NEXT:    add a2, a2, a2
diff --git a/llvm/test/CodeGen/RISCV/xtheadmempair.ll b/llvm/test/CodeGen/RISCV/xtheadmempair.ll
index 3525c40026064..7c940a3966217 100644
--- a/llvm/test/CodeGen/RISCV/xtheadmempair.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadmempair.ll
@@ -57,14 +57,14 @@ define i64 @lwud(ptr %a) {
 define i64 @ldd(ptr %a) {
 ; RV32XTHEADMEMPAIR-LABEL: ldd:
 ; RV32XTHEADMEMPAIR:       # %bb.0:
-; RV32XTHEADMEMPAIR-NEXT:    lw a1, 44(a0)
-; RV32XTHEADMEMPAIR-NEXT:    lw a2, 32(a0)
-; RV32XTHEADMEMPAIR-NEXT:    lw a3, 36(a0)
-; RV32XTHEADMEMPAIR-NEXT:    lw a0, 40(a0)
-; RV32XTHEADMEMPAIR-NEXT:    add a1, a3, a1
-; RV32XTHEADMEMPAIR-NEXT:    add a0, a2, a0
-; RV32XTHEADMEMPAIR-NEXT:    sltu a2, a0, a2
-; RV32XTHEADMEMPAIR-NEXT:    add a1, a1, a2
+; RV32XTHEADMEMPAIR-NEXT:    lw a1, 32(a0)
+; RV32XTHEADMEMPAIR-NEXT:    lw a2, 36(a0)
+; RV32XTHEADMEMPAIR-NEXT:    lw a3, 40(a0)
+; RV32XTHEADMEMPAIR-NEXT:    lw a0, 44(a0)
+; RV32XTHEADMEMPAIR-NEXT:    add a2, a2, a0
+; RV32XTHEADMEMPAIR-NEXT:    add a0, a1, a3
+; RV32XTHEADMEMPAIR-NEXT:    sltu a1, a0, a1
+; RV32XTHEADMEMPAIR-NEXT:    add a1, a2, a1
 ; RV32XTHEADMEMPAIR-NEXT:    ret
 ;
 ; RV64XTHEADMEMPAIR-LABEL: ldd:
@@ -245,10 +245,10 @@ define i64 @ld64(ptr %a) {
 define i128 @ld128(ptr %a) {
 ; RV32XTHEADMEMPAIR-LABEL: ld128:
 ; RV32XTHEADMEMPAIR:       # %bb.0:
-; RV32XTHEADMEMPAIR-NEXT:    th.lwd a2, a3, (a1), 1, 3
-; RV32XTHEADMEMPAIR-NEXT:    th.lwd a4, a5, (a1), 0, 3
-; RV32XTHEADMEMPAIR-NEXT:    th.swd a2, a3, (a0), 1, 3
-; RV32XTHEADMEMPAIR-NEXT:    th.swd a4, a5, (a0), 0, 3
+; RV32XTHEADMEMPAIR-NEXT:    th.lwd a2, a3, (a1), 0, 3
+; RV32XTHEADMEMPAIR-NEXT:    th.lwd a4, a5, (a1), 1, 3
+; RV32XTHEADMEMPAIR-NEXT:    th.swd a4, a5, (a0), 1, 3
+; RV32XTHEADMEMPAIR-NEXT:    th.swd a2, a3, (a0), 0, 3
 ; RV32XTHEADMEMPAIR-NEXT:    ret
 ;
 ; RV64XTHEADMEMPAIR-LABEL: ld128:
@@ -279,10 +279,10 @@ define void @sd64(ptr %a, i64 %b) {
 define void @sd128(ptr %a, i128 %b) {
 ; RV32XTHEADMEMPAIR-LABEL: sd128:
 ; RV32XTHEADMEMPAIR:       # %bb.0:
-; RV32XTHEADMEMPAIR-NEXT:    th.lwd a2, a3, (a1), 1, 3
-; RV32XTHEADMEMPAIR-NEXT:    th.lwd a4, a5, (a1), 0, 3
-; RV32XTHEADMEMPAIR-NEXT:    th.swd a2, a3, (a0), 1, 3
-; RV32XTHEADMEMPAIR-NEXT:    th.swd a4, a5, (a0), 0, 3
+; RV32XTHEADMEMPAIR-NEXT:    th.lwd a2, a3, (a1), 0, 3
+; RV32XTHEADMEMPAIR-NEXT:    th.lwd a4, a5, (a1), 1, 3
+; RV32XTHEADMEMPAIR-NEXT:    th.swd a4, a5, (a0), 1, 3
+; RV32XTHEADMEMPAIR-NEXT:    th.swd a2, a3, (a0), 0, 3
 ; RV32XTHEADMEMPAIR-NEXT:    ret
 ;
 ; RV64XTHEADMEMPAIR-LABEL: sd128:
diff --git a/llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll b/llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll
index d953d34e2d7b9..1c2eb5ecafbc4 100644
--- a/llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll
+++ b/llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll
@@ -137,13 +137,13 @@ define void @orarray100(ptr %a) {
 ; RV32-NEXT:    addi a1, a1, 1
 ; RV32-NEXT:    add a4, a0, a4
 ; RV32-NEXT:    lw a5, 0(a4)
-; RV32-NEXT:    seqz a6, a1
-; RV32-NEXT:    add a2, a2, a6
-; RV32-NEXT:    xori a6, a1, 100
 ; RV32-NEXT:    orn a5, a5, a3
-; RV32-NEXT:    or a6, a6, a2
 ; RV32-NEXT:    sw a5, 0(a4)
-; RV32-NEXT:    bnez a6, .LBB8_1
+; RV32-NEXT:    seqz a4, a1
+; RV32-NEXT:    xori a5, a1, 100
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    or a5, a5, a2
+; RV32-NEXT:    bnez a5, .LBB8_1
 ; RV32-NEXT:  # %bb.2: # %for.cond.cleanup
 ; RV32-NEXT:    ret
 ;
@@ -180,16 +180,16 @@ for.body:
 define void @orarray3(ptr %a) {
 ; CHECK-LABEL: orarray3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lw a1, 0(a0)
-; CHECK-NEXT:    lw a2, 4(a0)
-; CHECK-NEXT:    lw a3, 8(a0)
-; CHECK-NEXT:    lui a4, 1048560
-; CHECK-NEXT:    orn a1, a1, a4
-; CHECK-NEXT:    orn a2, a2, a4
-; CHECK-NEXT:    orn a3, a3, a4
-; CHECK-NEXT:    sw a1, 0(a0)
-; CHECK-NEXT:    sw a2, 4(a0)
-; CHECK-NEXT:    sw a3, 8(a0)
+; CHECK-NEXT:    lui a1, 1048560
+; CHECK-NEXT:    lw a2, 0(a0)
+; CHECK-NEXT:    lw a3, 4(a0)
+; CHECK-NEXT:    lw a4, 8(a0)
+; CHECK-NEXT:    orn a2, a2, a1
+; CHECK-NEXT:    orn a3, a3, a1
+; CHECK-NEXT:    orn a1, a4, a1
+; CHECK-NEXT:    sw a2, 0(a0)
+; CHECK-NEXT:    sw a3, 4(a0)
+; CHECK-NEXT:    sw a1, 8(a0)
 ; CHECK-NEXT:    ret
   %1 = load i32, ptr %a, align 4
   %or = or i32 %1, 65535
diff --git a/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll b/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll
index b7d7d4c0945b6..d9f6e1a5820c8 100644
--- a/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll
+++ b/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll
@@ -10,11 +10,11 @@ define dso_local void @zdinx_asm(ptr nocapture noundef writeonly %a, double noun
 ; CHECK-LABEL: zdinx_asm:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mv a5, a4
-; CHECK-NEXT:    mv a7, a2
 ; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    mv a6, a1
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:    #APP
-; CHECK-NEXT:    fsgnjx.d a2, a6, a4
+; CHECK-NEXT:    fsgnjx.d a2, a2, a4
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    sw a2, 8(a0)
 ; CHECK-NEXT:    sw a3, 12(a0)
@@ -30,11 +30,11 @@ define dso_local void @zdinx_asm_R(ptr nocapture noundef writeonly %a, double no
 ; CHECK-LABEL: zdinx_asm_R:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mv a5, a4
-; CHECK-NEXT:    mv a7, a2
 ; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    mv a6, a1
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:    #APP
-; CHECK-NEXT:    fsgnjx.d a2, a6, a4
+; CHECK-NEXT:    fsgnjx.d a2, a2, a4
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    sw a2, 8(a0)
 ; CHECK-NEXT:    sw a3, 12(a0)
@@ -133,21 +133,15 @@ entry:
 define dso_local void @zdinx_asm_cr(ptr nocapture noundef writeonly %a, double noundef %b, double noundef %c) nounwind {
 ; CHECK-LABEL: zdinx_asm_cr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; CHECK-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    mv a5, a4
-; CHECK-NEXT:    mv s1, a2
 ; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    mv s0, a1
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:    #APP
-; CHECK-NEXT:    fsgnjx.d a2, s0, a4
+; CHECK-NEXT:    fsgnjx.d a2, a2, a4
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    sw a2, 8(a0)
 ; CHECK-NEXT:    sw a3, 12(a0)
-; CHECK-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 entry:
   %arrayidx = getelementptr inbounds double, ptr %a, i32 1
@@ -189,21 +183,15 @@ entry:
 define dso_local void @zdinx_asm_cR(ptr nocapture noundef writeonly %a, double noundef %b, double noundef %c) nounwind {
 ; CHECK-LABEL: zdinx_asm_cR:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; CHECK-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    mv a5, a4
-; CHECK-NEXT:    mv s1, a2
 ; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    mv s0, a1
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:    #APP
-; CHECK-NEXT:    fsgnjx.d a2, s0, a4
+; CHECK-NEXT:    fsgnjx.d a2, a2, a4
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    sw a2, 8(a0)
 ; CHECK-NEXT:    sw a3, 12(a0)
-; CHECK-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
-; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 entry:
   %arrayidx = getelementptr inbounds double, ptr %a, i32 1
diff --git a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll
index 9a312d9daca8d..05af53bf8a2b4 100644
--- a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll
+++ b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll
@@ -39,9 +39,9 @@ define void @foo2(ptr nocapture %p, double %d) nounwind {
 ; RV32ZDINX-LABEL: foo2:
 ; RV32ZDINX:       # %bb.0: # %entry
 ; RV32ZDINX-NEXT:    mv a3, a2
-; RV32ZDINX-NEXT:    addi a0, a0, 2047
 ; RV32ZDINX-NEXT:    mv a2, a1
 ; RV32ZDINX-NEXT:    fadd.d a2, a2, a2
+; RV32ZDINX-NEXT:    addi a0, a0, 2047
 ; RV32ZDINX-NEXT:    sw a2, -3(a0)
 ; RV32ZDINX-NEXT:    sw a3, 1(a0)
 ; RV32ZDINX-NEXT:    ret
@@ -49,9 +49,9 @@ define void @foo2(ptr nocapture %p, double %d) nounwind {
 ; RV32ZDINXUALIGNED-LABEL: foo2:
 ; RV32ZDINXUALIGNED:       # %bb.0: # %entry
 ; RV32ZDINXUALIGNED-NEXT:    mv a3, a2
-; RV32ZDINXUALIGNED-NEXT:    addi a0, a0, 2047
 ; RV32ZDINXUALIGNED-NEXT:    mv a2, a1
 ; RV32ZDINXUALIGNED-NEXT:    fadd.d a2, a2, a2
+; RV32ZDINXUALIGNED-NEXT:    addi a0, a0, 2047
 ; RV32ZDINXUALIGNED-NEXT:    sw a2, -3(a0)
 ; RV32ZDINXUALIGNED-NEXT:    sw a3, 1(a0)
 ; RV32ZDINXUALIGNED-NEXT:    ret
@@ -108,36 +108,36 @@ define void @foo4(ptr %p) nounwind {
 ; RV32ZDINX-LABEL: foo4:
 ; RV32ZDINX:       # %bb.0: # %entry
 ; RV32ZDINX-NEXT:    addi sp, sp, -16
-; RV32ZDINX-NEXT:    addi a1, a0, 2047
-; RV32ZDINX-NEXT:    lw a2, -3(a1)
-; RV32ZDINX-NEXT:    lw a3, 1(a1)
 ; RV32ZDINX-NEXT:    sw a0, 8(sp)
-; RV32ZDINX-NEXT:    lui a0, %hi(d)
-; RV32ZDINX-NEXT:    sw a2, %lo(d)(a0)
-; RV32ZDINX-NEXT:    sw a3, %lo(d+4)(a0)
+; RV32ZDINX-NEXT:    addi a0, a0, 2047
+; RV32ZDINX-NEXT:    lw a1, 1(a0)
+; RV32ZDINX-NEXT:    lw a0, -3(a0)
+; RV32ZDINX-NEXT:    lui a2, %hi(d)
+; RV32ZDINX-NEXT:    sw a0, %lo(d)(a2)
+; RV32ZDINX-NEXT:    sw a1, %lo(d+4)(a2)
 ; RV32ZDINX-NEXT:    addi sp, sp, 16
 ; RV32ZDINX-NEXT:    ret
 ;
 ; RV32ZDINXUALIGNED-LABEL: foo4:
 ; RV32ZDINXUALIGNED:       # %bb.0: # %entry
 ; RV32ZDINXUALIGNED-NEXT:    addi sp, sp, -16
-; RV32ZDINXUALIGNED-NEXT:    addi a1, a0, 2047
-; RV32ZDINXUALIGNED-NEXT:    lw a2, -3(a1)
-; RV32ZDINXUALIGNED-NEXT:    lw a3, 1(a1)
 ; RV32ZDINXUALIGNED-NEXT:    sw a0, 8(sp)
-; RV32ZDINXUALIGNED-NEXT:    lui a0, %hi(d)
-; RV32ZDINXUALIGNED-NEXT:    sw a2, %lo(d)(a0)
-; RV32ZDINXUALIGNED-NEXT:    sw a3, %lo(d+4)(a0)
+; RV32ZDINXUALIGNED-NEXT:    addi a0, a0, 2047
+; RV32ZDINXUALIGNED-NEXT:    lw a1, 1(a0)
+; RV32ZDINXUALIGNED-NEXT:    lw a0, -3(a0)
+; RV32ZDINXUALIGNED-NEXT:    lui a2, %hi(d)
+; RV32ZDINXUALIGNED-NEXT:    sw a0, %lo(d)(a2)
+; RV32ZDINXUALIGNED-NEXT:    sw a1, %lo(d+4)(a2)
 ; RV32ZDINXUALIGNED-NEXT:    addi sp, sp, 16
 ; RV32ZDINXUALIGNED-NEXT:    ret
 ;
 ; RV64ZDINX-LABEL: foo4:
 ; RV64ZDINX:       # %bb.0: # %entry
 ; RV64ZDINX-NEXT:    addi sp, sp, -16
-; RV64ZDINX-NEXT:    ld a1, 2044(a0)
 ; RV64ZDINX-NEXT:    sd a0, 8(sp)
-; RV64ZDINX-NEXT:    lui a0, %hi(d)
-; RV64ZDINX-NEXT:    sd a1, %lo(d)(a0)
+; RV64ZDINX-NEXT:    ld a0, 2044(a0)
+; RV64ZDINX-NEXT:    lui a1, %hi(d)
+; RV64ZDINX-NEXT:    sd a0, %lo(d)(a1)
 ; RV64ZDINX-NEXT:    addi sp, sp, 16
 ; RV64ZDINX-NEXT:    ret
 entry:
@@ -184,10 +184,10 @@ define void @foo6(ptr %p, double %d) nounwind {
 ; RV32ZDINX-LABEL: foo6:
 ; RV32ZDINX:       # %bb.0: # %entry
 ; RV32ZDINX-NEXT:    mv a3, a2
-; RV32ZDINX-NEXT:    lui a2, %hi(.LCPI5_0)
-; RV32ZDINX-NEXT:    lw a4, %lo(.LCPI5_0)(a2)
-; RV32ZDINX-NEXT:    lw a5, %lo(.LCPI5_0+4)(a2)
 ; RV32ZDINX-NEXT:    mv a2, a1
+; RV32ZDINX-NEXT:    lui a1, %hi(.LCPI5_0)
+; RV32ZDINX-NEXT:    lw a4, %lo(.LCPI5_0)(a1)
+; RV32ZDINX-NEXT:    lw a5, %lo(.LCPI5_0+4)(a1)
 ; RV32ZDINX-NEXT:    fadd.d a2, a2, a4
 ; RV32ZDINX-NEXT:    addi a0, a0, 2047
 ; RV32ZDINX-NEXT:    sw a2, -3(a0)
@@ -197,10 +197,10 @@ define void @foo6(ptr %p, double %d) nounwind {
 ; RV32ZDINXUALIGNED-LABEL: foo6:
 ; RV32ZDINXUALIGNED:       # %bb.0: # %entry
 ; RV32ZDINXUALIGNED-NEXT:    mv a3, a2
-; RV32ZDINXUALIGNED-NEXT:    lui a2, %hi(.LCPI5_0)
-; RV32ZDINXUALIGNED-NEXT:    lw a4, %lo(.LCPI5_0)(a2)
-; RV32ZDINXUALIGNED-NEXT:    lw a5, %lo(.LCPI5_0+4)(a2)
 ; RV32ZDINXUALIGNED-NEXT:    mv a2, a1
+; RV32ZDINXUALIGNED-NEXT:    lui a1, %hi(.LCPI5_0)
+; RV32ZDINXUALIGNED-NEXT:    lw a4, %lo(.LCPI5_0)(a1)
+; RV32ZDINXUALIGNED-NEXT:    lw a5, %lo(.LCPI5_0+4)(a1)
 ; RV32ZDINXUALIGNED-NEXT:    fadd.d a2, a2, a4
 ; RV32ZDINXUALIGNED-NEXT:    addi a0, a0, 2047
 ; RV32ZDINXUALIGNED-NEXT:    sw a2, -3(a0)
@@ -226,10 +226,10 @@ define void @foo7(ptr nocapture %p) nounwind {
 ; RV32ZDINX:       # %bb.0: # %entry
 ; RV32ZDINX-NEXT:    addi sp, sp, -16
 ; RV32ZDINX-NEXT:    lui a1, %hi(d)
-; RV32ZDINX-NEXT:    lw a2, %lo(d+4)(a1)
-; RV32ZDINX-NEXT:    addi a1, a1, %lo(d)
-; RV32ZDINX-NEXT:    sw a2, 8(sp)
-; RV32ZDINX-NEXT:    lw a1, 8(a1)
+; RV32ZDINX-NEXT:    addi a2, a1, %lo(d)
+; RV32ZDINX-NEXT:    lw a1, %lo(d+4)(a1)
+; RV32ZDINX-NEXT:    sw a1, 8(sp)
+; RV32ZDINX-NEXT:    lw a1, 8(a2)
 ; RV32ZDINX-NEXT:    sw a1, 12(sp)
 ; RV32ZDINX-NEXT:    lw a2, 8(sp)
 ; RV32ZDINX-NEXT:    lw a3, 12(sp)
@@ -254,8 +254,8 @@ define void @foo7(ptr nocapture %p) nounwind {
 ; RV64ZDINX:       # %bb.0: # %entry
 ; RV64ZDINX-NEXT:    lui a1, %hi(d)
 ; RV64ZDINX-NEXT:    addi a2, a1, %lo(d)
-; RV64ZDINX-NEXT:    lwu a2, 8(a2)
 ; RV64ZDINX-NEXT:    lwu a1, %lo(d+4)(a1)
+; RV64ZDINX-NEXT:    lwu a2, 8(a2)
 ; RV64ZDINX-NEXT:    slli a2, a2, 32
 ; RV64ZDINX-NEXT:    or a1, a2, a1
 ; RV64ZDINX-NEXT:    sd a1, 2044(a0)
@@ -272,45 +272,45 @@ define void @foo8(ptr %p) nounwind {
 ; RV32ZDINX-LABEL: foo8:
 ; RV32ZDINX:       # %bb.0: # %entry
 ; RV32ZDINX-NEXT:    addi sp, sp, -16
-; RV32ZDINX-NEXT:    addi a1, a0, 2047
-; RV32ZDINX-NEXT:    lw a2, -3(a1)
-; RV32ZDINX-NEXT:    lw a3, 1(a1)
 ; RV32ZDINX-NEXT:    sw a0, 8(sp)
-; RV32ZDINX-NEXT:    sw a2, 0(sp)
-; RV32ZDINX-NEXT:    sw a3, 4(sp)
+; RV32ZDINX-NEXT:    addi a0, a0, 2047
+; RV32ZDINX-NEXT:    lw a1, 1(a0)
+; RV32ZDINX-NEXT:    lw a0, -3(a0)
+; RV32ZDINX-NEXT:    lui a2, %hi(d)
+; RV32ZDINX-NEXT:    addi a3, a2, %lo(d)
+; RV32ZDINX-NEXT:    sw a0, 0(sp)
+; RV32ZDINX-NEXT:    sw a1, 4(sp)
 ; RV32ZDINX-NEXT:    lw a0, 4(sp)
-; RV32ZDINX-NEXT:    lui a1, %hi(d)
-; RV32ZDINX-NEXT:    addi a2, a1, %lo(d)
-; RV32ZDINX-NEXT:    sw a0, 8(a2)
+; RV32ZDINX-NEXT:    sw a0, 8(a3)
 ; RV32ZDINX-NEXT:    lw a0, 0(sp)
-; RV32ZDINX-NEXT:    sw a0, %lo(d+4)(a1)
+; RV32ZDINX-NEXT:    sw a0, %lo(d+4)(a2)
 ; RV32ZDINX-NEXT:    addi sp, sp, 16
 ; RV32ZDINX-NEXT:    ret
 ;
 ; RV32ZDINXUALIGNED-LABEL: foo8:
 ; RV32ZDINXUALIGNED:       # %bb.0: # %entry
 ; RV32ZDINXUALIGNED-NEXT:    addi sp, sp, -16
-; RV32ZDINXUALIGNED-NEXT:    addi a1, a0, 2047
-; RV32ZDINXUALIGNED-NEXT:    lw a2, -3(a1)
-; RV32ZDINXUALIGNED-NEXT:    lw a3, 1(a1)
 ; RV32ZDINXUALIGNED-NEXT:    sw a0, 8(sp)
-; RV32ZDINXUALIGNED-NEXT:    lui a0, %hi(d)
-; RV32ZDINXUALIGNED-NEXT:    addi a0, a0, %lo(d)
-; RV32ZDINXUALIGNED-NEXT:    sw a2, 4(a0)
-; RV32ZDINXUALIGNED-NEXT:    sw a3, 8(a0)
+; RV32ZDINXUALIGNED-NEXT:    addi a0, a0, 2047
+; RV32ZDINXUALIGNED-NEXT:    lw a1, 1(a0)
+; RV32ZDINXUALIGNED-NEXT:    lw a0, -3(a0)
+; RV32ZDINXUALIGNED-NEXT:    lui a2, %hi(d)
+; RV32ZDINXUALIGNED-NEXT:    addi a2, a2, %lo(d)
+; RV32ZDINXUALIGNED-NEXT:    sw a0, 4(a2)
+; RV32ZDINXUALIGNED-NEXT:    sw a1, 8(a2)
 ; RV32ZDINXUALIGNED-NEXT:    addi sp, sp, 16
 ; RV32ZDINXUALIGNED-NEXT:    ret
 ;
 ; RV64ZDINX-LABEL: foo8:
 ; RV64ZDINX:       # %bb.0: # %entry
 ; RV64ZDINX-NEXT:    addi sp, sp, -16
-; RV64ZDINX-NEXT:    ld a1, 2044(a0)
 ; RV64ZDINX-NEXT:    sd a0, 8(sp)
-; RV64ZDINX-NEXT:    lui a0, %hi(d)
-; RV64ZDINX-NEXT:    addi a2, a0, %lo(d)
-; RV64ZDINX-NEXT:    sw a1, %lo(d+4)(a0)
-; RV64ZDINX-NEXT:    srli a1, a1, 32
-; RV64ZDINX-NEXT:    sw a1, 8(a2)
+; RV64ZDINX-NEXT:    ld a0, 2044(a0)
+; RV64ZDINX-NEXT:    lui a1, %hi(d)
+; RV64ZDINX-NEXT:    addi a2, a1, %lo(d)
+; RV64ZDINX-NEXT:    sw a0, %lo(d+4)(a1)
+; RV64ZDINX-NEXT:    srli a0, a0, 32
+; RV64ZDINX-NEXT:    sw a0, 8(a2)
 ; RV64ZDINX-NEXT:    addi sp, sp, 16
 ; RV64ZDINX-NEXT:    ret
 entry:
@@ -358,11 +358,11 @@ define void @foo9(ptr nocapture %p) nounwind {
 ; RV64ZDINX-LABEL: foo9:
 ; RV64ZDINX:       # %bb.0: # %entry
 ; RV64ZDINX-NEXT:    lui a1, %hi(e)
-; RV64ZDINX-NEXT:    addi a2, a1, %lo(e)
-; RV64ZDINX-NEXT:    lwu a2, 4(a2)
-; RV64ZDINX-NEXT:    lwu a1, %lo(e)(a1)
-; RV64ZDINX-NEXT:    slli a2, a2, 32
-; RV64ZDINX-NEXT:    or a1, a2, a1
+; RV64ZDINX-NEXT:    lwu a2, %lo(e)(a1)
+; RV64ZDINX-NEXT:    addi a1, a1, %lo(e)
+; RV64ZDINX-NEXT:    lwu a1, 4(a1)
+; RV64ZDINX-NEXT:    slli a1, a1, 32
+; RV64ZDINX-NEXT:    or a1, a1, a2
 ; RV64ZDINX-NEXT:    sd a1, 2044(a0)
 ; RV64ZDINX-NEXT:    ret
 entry:
@@ -380,41 +380,41 @@ define void @foo10(ptr %p) nounwind {
 ; RV32ZDINX-NEXT:    lw a2, -3(a1)
 ; RV32ZDINX-NEXT:    lw a3, 1(a1)
 ; RV32ZDINX-NEXT:    sw a0, 8(sp)
+; RV32ZDINX-NEXT:    lui a0, %hi(e)
 ; RV32ZDINX-NEXT:    sw a2, 0(sp)
 ; RV32ZDINX-NEXT:    sw a3, 4(sp)
-; RV32ZDINX-NEXT:    lw a0, 4(sp)
-; RV32ZDINX-NEXT:    lui a1, %hi(e)
-; RV32ZDINX-NEXT:    addi a2, a1, %lo(e)
-; RV32ZDINX-NEXT:    sw a0, 4(a2)
-; RV32ZDINX-NEXT:    lw a0, 0(sp)
-; RV32ZDINX-NEXT:    sw a0, %lo(e)(a1)
+; RV32ZDINX-NEXT:    addi a1, a0, %lo(e)
+; RV32ZDINX-NEXT:    lw a2, 4(sp)
+; RV32ZDINX-NEXT:    sw a2, 4(a1)
+; RV32ZDINX-NEXT:    lw a1, 0(sp)
+; RV32ZDINX-NEXT:    sw a1, %lo(e)(a0)
 ; RV32ZDINX-NEXT:    addi sp, sp, 16
 ; RV32ZDINX-NEXT:    ret
 ;
 ; RV32ZDINXUALIGNED-LABEL: foo10:
 ; RV32ZDINXUALIGNED:       # %bb.0: # %entry
 ; RV32ZDINXUALIGNED-NEXT:    addi sp, sp, -16
-; RV32ZDINXUALIGNED-NEXT:    addi a1, a0, 2047
-; RV32ZDINXUALIGNED-NEXT:    lw a2, -3(a1)
-; RV32ZDINXUALIGNED-NEXT:    lw a3, 1(a1)
 ; RV32ZDINXUALIGNED-NEXT:    sw a0, 8(sp)
-; RV32ZDINXUALIGNED-NEXT:    lui a0, %hi(e)
-; RV32ZDINXUALIGNED-NEXT:    addi a0, a0, %lo(e)
-; RV32ZDINXUALIGNED-NEXT:    sw a2, 0(a0)
-; RV32ZDINXUALIGNED-NEXT:    sw a3, 4(a0)
+; RV32ZDINXUALIGNED-NEXT:    addi a0, a0, 2047
+; RV32ZDINXUALIGNED-NEXT:    lw a1, 1(a0)
+; RV32ZDINXUALIGNED-NEXT:    lw a0, -3(a0)
+; RV32ZDINXUALIGNED-NEXT:    lui a2, %hi(e)
+; RV32ZDINXUALIGNED-NEXT:    addi a2, a2, %lo(e)
+; RV32ZDINXUALIGNED-NEXT:    sw a0, 0(a2)
+; RV32ZDINXUALIGNED-NEXT:    sw a1, 4(a2)
 ; RV32ZDINXUALIGNED-NEXT:    addi sp, sp, 16
 ; RV32ZDINXUALIGNED-NEXT:    ret
 ;
 ; RV64ZDINX-LABEL: foo10:
 ; RV64ZDINX:       # %bb.0: # %entry
 ; RV64ZDINX-NEXT:    addi sp, sp, -16
-; RV64ZDINX-NEXT:    ld a1, 2044(a0)
 ; RV64ZDINX-NEXT:    sd a0, 8(sp)
-; RV64ZDINX-NEXT:    lui a0, %hi(e)
-; RV64ZDINX-NEXT:    sw a1, %lo(e)(a0)
-; RV64ZDINX-NEXT:    addi a0, a0, %lo(e)
-; RV64ZDINX-NEXT:    srli a1, a1, 32
-; RV64ZDINX-NEXT:    sw a1, 4(a0)
+; RV64ZDINX-NEXT:    ld a0, 2044(a0)
+; RV64ZDINX-NEXT:    lui a1, %hi(e)
+; RV64ZDINX-NEXT:    sw a0, %lo(e)(a1)
+; RV64ZDINX-NEXT:    addi a1, a1, %lo(e)
+; RV64ZDINX-NEXT:    srli a0, a0, 32
+; RV64ZDINX-NEXT:    sw a0, 4(a1)
 ; RV64ZDINX-NEXT:    addi sp, sp, 16
 ; RV64ZDINX-NEXT:    ret
 entry:
@@ -521,10 +521,10 @@ define double @foo13(ptr nocapture %p) nounwind {
 ; RV64ZDINX-LABEL: foo13:
 ; RV64ZDINX:       # %bb.0: # %entry
 ; RV64ZDINX-NEXT:    lui a0, %hi(f)
-; RV64ZDINX-NEXT:    lwu a1, %lo(f+8)(a0)
-; RV64ZDINX-NEXT:    lwu a0, %lo(f+4)(a0)
-; RV64ZDINX-NEXT:    slli a1, a1, 32
-; RV64ZDINX-NEXT:    or a0, a1, a0
+; RV64ZDINX-NEXT:    lwu a1, %lo(f+4)(a0)
+; RV64ZDINX-NEXT:    lwu a0, %lo(f+8)(a0)
+; RV64ZDINX-NEXT:    slli a0, a0, 32
+; RV64ZDINX-NEXT:    or a0, a0, a1
 ; RV64ZDINX-NEXT:    ret
 entry:
   %add.ptr = getelementptr inbounds i8, ptr @f, i64 4


